#================================================#
# fluff.py #
# Author: Fredrik Boulund #
# Date: 2011-07-18 #
# Version 95 #
# #
# Contains the following functions: #
# parse_hmmsearch_output #
# parse_sequence_positions_from_hmmsearch #
# count_nucleotides_from_hmmsearch #
# fixfasta #
# classify_qnr #
# extend_sequences_from_hmmsearch #
# retrieve_sequences_from_hmmsearch #
# retrieve_sequences_from_db #
# uniqueify_seqids #
# run_blastclust #
# parse_blastclust #
# deuniqueify_seqids #
# malign_clusters #
# limit_sequence_length #
# retrieve_sequences_from_fasta #
# cleanup #
# #
# Contains the following custom exceptions: #
# PathError #
# ParseError #
#================================================#
#
# Copyright (C) 2011 Fredrik Boulund
#
# HERE WILL BE LICENSE INFORMATION
##-----------------------------------------------##
## PARSE HMMSEARCH OUTPUT ##
##-----------------------------------------------##
[docs]def parse_hmmsearch_output(filename,MIN_SCORE=0):
'''
Parses and retrieves sequence score, maximal domain score
and sequence IDs from an hmmsearch output file.
Input is:
file filename string to hmmsearch output
MIN_SCORE a float with minimum score threshold
Output:
returntuple nested tuple with the following contents;
score_id_tuples, dbpath
where score_id_tuples contains triplets with
sequence score, domain score, sequence id,
and dbpath contains a string with the path
to the database searched by hmmsearch.
Errors:
ParseError raised if the file does not conform with
hmmsearch output format (i.e. does not begin
with '# hmmsearch ::'.
ValueError raised if no sequence in the hmmsearch output
is found with score >= MIN_SCORE.
'''
from os import path
import re
MIN_SCORE = float(MIN_SCORE)
# Initialize the list of sequence IDs
sequence_ids = []
sequence_scores = []
domain_scores = []
file = open(filename,"r")
line = file.readline()
if not line.startswith("# hmmsearch ::"):
raise ParseError("NOTE: File is not a valid hmmsearch output file: "+filename)
while file:
line = file.readline()
# Extract the path to the database where to collect the sequence data
dbpattern = r"# target sequence database:\s*([\w\d\./-]*)"
foundpath = re.match(dbpattern,line)
if foundpath is not(None):
dbpath = path.abspath(foundpath.group(1))
#print dbpath #Troubleshooting
# Match to find the sequence score, maximum domain score and sequence ID
# from the list of results
pattern = r'^.{13,14}?(\d+\.\d).{18,20}\s(\d+\.\d).{17}\s([\w\d\._|-]*)'
found = re.match(pattern,line) # Find the first match to the pattern
# If the DOMAIN score is higher than or equal MIN_SCORE count it as a
# preliminary hit and store the sequence ID and scores.
if found is not None and float(found.group(2)) >= float(MIN_SCORE):
sequence_scores.append(found.group(1))
domain_scores.append(found.group(2))
# Attach the database name to the sequence id for easier identification
sequence_ids.append(found.group(3))
#print found.group(1),found.group(2),found.group(3) #Troubleshooting
elif line.startswith(">>") or line.startswith("//"): # Now we stop reading the file!
break
file.close()
score_id_tuples = zip(sequence_scores,domain_scores,sequence_ids)
if score_id_tuples == []:
raise ValueError
else:
#print "Found",len(sequence_ids),"sequences above score",MIN_SCORE
returntuple = (score_id_tuples,dbpath)
return returntuple
############## END parse_hmmsearch_output
##-------------------------------------------------------------##
## COUNT NUCLEOTIDES SEARCHED FROM HMMSEARCH OUTPUT ##
##-------------------------------------------------------------##
[docs]def count_nucleotides_from_hmmsearch(hmmsearchfiles):
'''
Takes hmmsearch-files and summarizes the number of nucleotides searched.
Input:
hmmsearchfiles a list of hmmsearchfiles (paths)
Output:
total_residues an integer with the number of nucleotides searched
'''
import re
regex = re.compile(r'\((\d+) residues\)')
total_residues = 0
for hmmsearchfile in hmmsearchfiles:
residues = 0
for line in open(hmmsearchfile):
hit = re.search(regex,line)
if hit is not None:
residues += int(hit.group(1))
total_residues += residues
return total_residues
############## END count_nucleotides_from_hmmsearch
##-----------------------------------------------##
## FIX BADLY FORMATTED FASTA FILES ##
##-----------------------------------------------##
[docs]def fixfasta(sequences):
'''
Takes a list of sequences and tries to correct their
formatting. Designed to be used only in the
the function retrieve_sequences_from_hmmsearch...
Input:
sequences list of sequences, each in one
complete string with \n markers
between identifier line and sequence.
Output:
outsequences list of sequences with hopefully
better formatting.
Errors:
(none) probably a lot... :)
'''
from math import ceil
outsequences = []
for sequence in sequences:
splitstring = sequence.split("\n",1)
number_of_rows = int(ceil(len(splitstring[1]) / 80.0))
seq = []
if number_of_rows > 1:
for row in xrange(1,number_of_rows):
seq.append(splitstring[1][:80] + "\n")
splitstring[1] = splitstring[1][80:]
seq.append(splitstring[1]) # + "\n")
seq.insert(0,splitstring[0] + "\n")
seq = ''.join(seq)
else:
seq = sequence
outsequences.append(seq)
return outsequences
############## END fixfasta
##---------------------------------------------------------------------------##
## CLASSIFY SEQUENCES AS QNR ##
##---------------------------------------------------------------------------##
[docs]def classify_qnr(sequence_length, domain_score, func="", longseqcutoff=75, longseqdef=85):
"""
Classifies a sequence as Qnr or not.
Using the domain_score and a hardcoded (or user defined function)
to classify a given sequence as Qnr or not.
Input:
sequence_length an integer with the sequence length.
domain_score a float with the domain score for this sequence.
func an optional function to determine classification.
longseqcutoff the classification cutoff (minimum score) for long
qnr sequences.
longseqdef the definition for long sequences.
Output:
classification a boolean determining whether it should be classified
as Qnr or not.
Errors:
(none)
"""
# Define a hardcoded function if none given
if func == "":
func = lambda L: 0.57*L + 3
# Pretty self-explanatory. Has a range in which the classification
# function is used, determined by the first if-statement
if (int(sequence_length) >= int(longseqdef)) and (float(domain_score) >= float(longseqcutoff)):
return True
elif int(sequence_length) < 10: # HARDCODED MIN FRAGMENT LENGTH
return False
elif int(sequence_length) < int(longseqdef):
if float(domain_score) > func(float(sequence_length)):
return True
else:
return False
else:
return False
######################### END classify_qnr
##-----------------------------------------------##
## PARSE SEQUENCES POSITIONS FROM HMMSEARCH ##
##-----------------------------------------------##
[docs]def parse_sequence_positions_from_hmmsearch(filepath, seqid_list, min_score=0, func="",
longseqcutoff=75, longseqdef=85):
'''
Retrieves aligned sequence positions from the hmmsearch output file
for domains that can be classified according to the classification function.
Uses classify_qnr implicitly.
Input:
filepath path to hmmsearch output file.
seqid_list list with sequence IDs to retrieve.
min_score the minimum domain score, default=0.
func (lambda) function used to classify fragments.
longseqcutoff parameter for the classification function.
longseqdef parameter for the classification function.
Output:
seqsnutts a dictionary with sequenceIDs as keys to list containing
tuples with position information for the domain alignment
hit.
Errors:
PathError raised if the hmmsearch output file can
not be found at the specified location.
ValueError raised if no sequences are found in the
hmmsearch output file.
'''
from os import path
import re
#print ("NOTICE: Can only retrieve sequences from hmmsearch output if "
# "hmmsearch option --notextw was enabled!")
filepath = path.abspath(filepath)
if not(path.exists(filepath)):
raise PathError("ERROR: The path specified from which to parse sequence "
"'coordinates' is not valid.")
seqid_list = list(seqid_list)
file = open(filepath,"r")
sequences = [] #PREALLOC
# Regular expression to match the line containing the scores and coordinates of the alignment of the
# domain to the HMM.
# example line: (the vvv's designate what's interesting)
# vvv vvv vvv
# score bias c-Evalue i-Evalue hmmfrom hmm to alifrom ali to envfrom env to acc
# 1 ! 413.3 4.9 1.6e-125 1e-120 3 217 .] 12 226 .. 9 226 .. 0.99
# ( ) ( ) ( ) ( ) ( )
regex = r'\s+\d+\s.\s+(\d+\.\d).+..\s+(\d+)\s+(\d+)\s+..\s+(\d+)\s+(\d+)\s+..'
regex = re.compile(regex)
# Create a dictionary that will contain key -coordinates for sequence alignments
# from the hmmsearch output. If several coordinates exists (multiple
# domains match) they will be added to the list for the correct
# key (sequence id)
seqsnutts = {}
while file:
line = file.readline()
if line == "":
break # EOF
for sequenceID in seqid_list:
if line.startswith(">>") and (sequenceID in line):
# THE FOLLOWING WAS CORRECT UNTIL I DISCOVERED A SPECIAL
# 'FEATURE' IN HMMER3 THAT MAKES THE NEXT STATEMENT INVALID.
## Step down three lines;
## they contain nothing of interest
#for a in xrange(1,4):
# line = file.readline()
line = file.readline()
# We are currently on lines where there are alignments
alignments = True
while alignments:
hit = re.match(regex,line)
if hit is not None:
domainscore = hit.group(1)
leftpos = hit.group(2)
rightpos = hit.group(3)
# Compute the sequence length, +1 is important!
sequencelength = int(rightpos)-int(leftpos)+1
# CLASSIFY SEQUENCES USING CLASSIFICATION FUNCTION
if classify_qnr(sequencelength, domainscore, func,
longseqcutoff, longseqdef):
try:
seqsnutts[sequenceID].append((hit.group(2),hit.group(3)))
except KeyError:
# This happens when this is a new sequenceID
seqsnutts[sequenceID] = [(hit.group(2),hit.group(3))]
line = file.readline()
# Break when we reach the end of the alignments
# for this sequence ID (an empty line)
if line.startswith("\n"):
alignments = False #break inner loop
#This should never happend
if seqsnutts == {}:
raise ValueError # No sequence was found!
return seqsnutts
############## END parse_sequence_positions_from_hmmsearch
##---------------------------------------------------------------------------##
## EXTEND SEQUENCES FROM HMMSEARCH ##
##---------------------------------------------------------------------------##
[docs]def extend_sequences_from_hmmsearch(hmmfilepath, seqid_list, min_score, dbpath,
extendleft=0, extendright=0, func="",
longseqcutoff=75, longseqdef=85):
'''
Retrieves "a little more" than the matching domain hits from the
hmmsearch output. It also classifies the hits and discards those
that do not meet the criteria. It prepends the fasta headers with
source information according to source file name and folder name.
Input:
hmmfilepath path to hmmsearch output file.
seqid_list a list with sequence IDs to retrieve.
min_score a primitive first classification.
dbpath system path to the source FASTA file/database.
extendleft an integer with the number of amino acids
to retrieve off the lefthand edge of the
hmmsearch alignment.
extendright an integer with the number of amino acids
to retrieve off the righthand edge of the
hmmsearch alignment.
func (lambda) function for use in classify_qnr,
if this is "" then classify_qnr will use a
hardcoded function.
longseqcutoff longseqcutoff for use in classify_qnr.
longseqdef long sequence definition for use in classify_qnr
Output:
sequences a list of sequences retrieved
message a string with error message (if any)
Errors:
'''
from sys import argv, exit
import subprocess, shlex
from os import path
# List of strings with error message if any
errmessages = []
# Make sure the seqid_list is a list
# This will crash the script or something if the input is not
# already a list or a tuple.
seqid_list = list(seqid_list)
# Set the stepleft and stepright parameters,
try:
extendleft = int(extendleft)
except ValueError:
errmessages.append("NOTE: Could not interpret extend parameters, used extendleft=0")
extendleft = 0
try:
extendright = int(extendright)
except ValueError:
errmessages.append("NOTE: Could not interpret extend parameters, used extendright=0")
extendright = 0
# Find the matched domain sequence positions from hmmsearch output
seqpositions = parse_sequence_positions_from_hmmsearch(hmmfilepath,
seqid_list, min_score, func, longseqcutoff, longseqdef)
# Name of the database and source file, to be appended to the sequence identifier
# Parsed out of the path to the source file, assumed to be stored accordingly:
# /[...]/databasename/sourcefile.whatever
# Extract the database name from the source path
dbname = path.abspath(dbpath).split("/")[-2]
# Join the dbname with the source file name
source_name = path.basename(dbpath).split(".")
dbname = "_".join([dbname,source_name[0]])
# Check if the database index file exists, *.cidx
if path.isfile(str(dbpath)+".cidx"):
usecdbyank = True
else:
usecdbyank = False
errmessages.append("NOTE: cannot find "+str(dbpath)+".cidx,\n falling back to retrieving aligned domain from hmmsearch output\n")
#print dbpath #troubleshooting
sequences = []
for sequence in seqpositions.keys():
# For each tuple of positions (i.e. domain) that matched
# against the model, retrieve the requested part from
# the database. At each index position in the array,
# there is a list of tuples with sequence positions.
for domain in seqpositions[sequence]:
# Extract sequences positions
leftpos, rightpos = domain #seqpositions[domain][0]
# Adjust from which positions to retrieve
# sequences from source database. Correct if
# outside bounds
leftpos = int(leftpos)-extendleft
if leftpos < 1:
leftpos = 1
# No need to check for stepping outside sequence length to the right,
# since cdbyank handles this
rightpos = int(rightpos)+extendright
if usecdbyank:
# Create a system call to cdbyank
cdbyank_call = ''.join(["cdbyank ", str(dbpath), ".cidx ", "-R -a ",
"'", str(sequence), " ", str(leftpos), " ", str(rightpos), "'"])
# Parse the arguments for Popen
cdbyank_args = shlex.split(cdbyank_call)
#print cdbyank_call #DEBUG
#print cdbyank_args #DEBUG
# Run cdbyank through Popen and let it communicate stdout to
# stdout variable, it will never contain more than one extended
# retrieved sequence.
stdout, stderr = subprocess.Popen(cdbyank_args,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE).communicate()
# If something went wrong, inform the user and retrieve only the
# aligned part directly from hmmsearch output as fall back procedure!
if stderr:
errmessages.append("ERROR: cdbyank "+stderr+" when trying to retrieve "+sequence+"\n")
# Append the database name and "source"/location to the FASTA
# header of the retrieved extended sequnece before appending
# it the the return sequence list
extended_domain = stdout.split("\n")
extended_domain[0] = ''.join([">",dbname,"_",extended_domain[0][1:]])
extended_domain = '\n'.join(extended_domain)
sequences.append(str(extended_domain))
else:
# Retrieve the sequence in question directly from hmmsearch
# output instead.
seq=[] # Empty this so we only have one sequence in here
try:
# Sequence needs to be a list even if it's only 1 seq vvvvvvvv
seq = retrieve_sequences_from_hmmsearch(hmmfilepath, [sequence],
0, dbpath, func, longseqcutoff, longseqdef)
sequences.append(seq[0]+"\n")
except ValueError:
errmessages.append("ERROR: Sequence "+sequence+" could not be retrieved.\n")
return (sequences, errmessages)
######################### END extend_sequences_from_hmmsearch
##-----------------------------------------------##
## RETRIEVE SEQUENCES FROM HMMSEARCH ##
##-----------------------------------------------##
[docs]def retrieve_sequences_from_hmmsearch(filepath, seqid_list, min_score, dbpath,
func="", longseqcutoff=75, longseqdef=85):
'''
Retrieves aligned sequence parts from the hmmsearch output file
for domains with scores above min_score. This function potentially
requires a lot of memory!
Input:
filepath path to hmmsearch output file.
seqid_list list with sequence IDs to retrieve.
min_score the minimum domain score.
dbpath path do the database where the sequences were
hmmsearched from.
func (lambda) function for use in classify_qnr.
longseqcutoff longseqcutoff for use in classify_qnr.
longseqthresh longseqthresh for use in classify_qnr.
Output:
seqsnutts a list with sequences parsed from the alignment.
Errors:
PathError raised if the hmmsearch output file can
not be found at the specified location.
ValueError raised if no sequences are found in the
hmmsearch output file.
'''
from os import path
from string import maketrans
import re
# TODO: Make function check if --notextw was enabled
#print ("NOTICE: Can only retrieve sequences from hmmsearch output if "
# "hmmsearch option --notextw was enabled!")
seqid_list = list(seqid_list)
filepath = path.abspath(filepath)
if not(path.exists(filepath)):
raise PathError("ERROR: The path specified from which to parse sequence "
"'coordinates' is not valid.")
# Regular expression to match the score of the aligned domain
regex_scores = r'\s+== domain\s+\d+\s+score:\s+(\d+\.\d)'
regex_scores = re.compile(regex_scores)
# Create a list that will hold the retrieved sequences with their
# identifiers.
sequences = []
# Name of the database and source file, to be appended to the sequence identifier
# Parsed out of the path to the source file, assumed to be stored accordingly:
# /[...]/databasename/sourcefile.whatever
# Extract the database name from the source path
dbname = path.abspath(dbpath).split("/")[-2]
# Join the dbname with the source file name
source_name = path.basename(dbpath).split(".")
dbname = "_".join([dbname,source_name[0]])
file = open(filepath,"r")
while file:
line = file.readline()
if line == "":
break # EOF
for sequenceID in seqid_list:
if line.startswith(">>") and (sequenceID in line):
# Append the database name to the sequence identifier
sequence_identifier_line = ''.join([">",dbname,"_",line[3:]])
# Step down five lines;
# they contain nothing of interest
for a in xrange(0,5):
line = file.readline()
# We are currently on lines where there might be alignments
alignments = True
while alignments:
scorematch = re.match(regex_scores,line)
if scorematch is not None:
current_domain_score = scorematch.group(1)
if float(current_domain_score) > float(min_score):
# Read two more uninteresting lines with only
# the aligned part of the sequence to the HMM
file.readline() #Contains nothing of interest
file.readline() #Contains nothing of interest
line = file.readline() # this is the line we want!
regex_domain = r'\s+([\w\.|\_-]+)\s+\d+\s([\w\*-]+)\s+\d+[\s\n]+'
domainhit = re.search(regex_domain,line)
if domainhit is not None:
# Use translate to remove dashes
# and change to uppercase
tr = maketrans("","")
domain = domainhit.group(2).upper().translate(tr,"-*")
# PERFORM CLASSIFICATION OF DOMAIN HIT
classification = classify_qnr(sequence_length=len(domain),
domain_score=current_domain_score,
func=func,
longseqcutoff=longseqcutoff,
longseqdef=longseqdef)
# If the current domain sequence is classified as a potential
# true hit, append it to the list of sequences
if classification:
sequences.append(''.join([sequence_identifier_line, domain]))
line = file.readline()
# Break when we reach the end of the alignments
# for this sequence ID
if line.startswith(">>"):
alignments = False #break inner while loop
elif line == "":
break # EOF, breaks while loop
#This should never happend
if not sequences:
raise ValueError # No sequence was found!
# Format the sequences so that they conform
# better to FASTA "standard"
sequences = fixfasta(sequences)
return sequences
############## END retrieve_sequences_from_hmmsearch
##---------------------------------------------------------------------------##
## RETRIEVE SEQUENCES FROM DATABASE ##
##---------------------------------------------------------------------------##
[docs]def retrieve_sequences_from_db(dbpath, seqid_list, domain_scores, retr_seq_filepath,
func="", longseqcutoff=75, longseqdef=85):
'''
Retrieves complete source sequences to hits in hmmsearch from their
source database files. It also classifies the hits and discards those
that do not meet the criteria. It prepends the fasta headers with
source information.
Input:
dbpath path to database FASTA file.
seqid_list a list with sequence IDs to retrieve.
domain_scores a list with domain scores for the sequences IDs.
retr_seq_filepath path to output file.
func (lambda) function for use in classify_qnr,
if this is "" then classify_qnr will use a
hardcoded function.
longseqcutoff longseqcutoff for use in classify_qnr.
longseqdef long sequence definition for use in classify_qnr
Output:
sequences a list with sequences
errmessages a list with error messages (if any)
Errors:
'''
from sys import argv, exit
import subprocess, shlex
from os import path
# List of strings with error message if any
errmessages = []
# Make sure the seqid_list is a list
seqid_list = list(seqid_list)
# Name of the database and source file, to be appended to the sequence identifier
# Parsed out of the path to the source file, assumed to be stored accordingly:
# /[...]/databasename/sourcefile.whatever
# Extract the database name from the source path
dbname = path.abspath(dbpath).split("/")[-2]
# Join the dbname with the source file name
source_name = path.basename(dbpath).split(".")
dbname = "_".join([dbname,source_name[0]])
# Make sure there is a cdbfasta index file, *.cidx:
if path.isfile(str(dbpath)+".cidx"):
usecdbyank = True
else:
usecdbyank = False
errmessages.append("ERROR: No index file "+str(dbpath)+".cidx,\n Skipping\n")
sequences = []
for sequence in seqid_list:
if usecdbyank:
# Create a system call to cdbyank
cdbyank_call = ''.join(["cdbyank ", str(dbpath), ".cidx ", "-a",
"'", str(sequence), "'"])
# Parse the arguments for Popen
cdbyank_args = shlex.split(cdbyank_call)
#print cdbyank_call #DEBUG
#print cdbyank_args #DEBUG
# Run cdbyank through Popen and let it communicate stdout to
# stdout variable, it will never contain more than one retrieved
# source sequence (though this could potentially be a VERY
# large genome, and might crash the program if too large).
stdout, stderr = subprocess.Popen(cdbyank_args,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE).communicate()
# If something went wrong, make possible to inform the user later!
if stderr:
errmessages.append("ERROR: cdbyank "+stderr+" trying to retrieve "+sequence+"\n")
continue
# Append the database file name and "source"/location to the FASTA
# header before appending it the the return sequence list
extended_domain = stdout.split("\n")
extended_domain[0] = ''.join([">",dbname,"_",extended_domain[0][1:]])
extended_domain = '\n'.join(extended_domain)
sequences.append(str(extended_domain))
return (sequences, errmessages)
######################### END retrieve_sequences_from_db
##-----------------------------------------------##
## UNIQUE:IFY SEQUENCE IDS ##
##-----------------------------------------------##
[docs]def uniqueify_seqids(sequences,filename):
'''
Unique:ify sequences identifiers in fasta formatted
sequences by adding an integer right after the sequence
identifer at the first space.
Input:
sequences filename with sequences in fasta format
to unique:ify
filename the output filename for unique sequences
Output:
(None) The list of strings in fasta format with all
sequences inputted sequences, now with
added [integer] right after the sequence
identifier, is written out
to a file for further usage in the OS
environment (i.e. in formatdb)
Errors:
ValueError raised if the input filenames are not valid
PathError raised if the input filename path is invalid
'''
from os import path
import re
if not(isinstance(sequences,str)):
raise ValueError("Input sequences filename to uniqueify_seqids was not a string!")
if not(isinstance(filename,str)):
raise ValueError("Output filename to uniqueify_seqids was not a string!")
# Open file for reading
try:
infile = open(path.abspath(sequences),'r')
except OSError:
raise PathError("ERROR: Path to file with sequence IDs to unique:ify is invalid!")
try:
outfile = open(path.abspath(filename),'w')
except OSError:
raise PathError("ERROR: The output filename for unique sequences could not be opened")
# Compile a regular expression to match the sequence identifier string and
# put it in two groups, first with sequence id and second with the
# rest of the sequence identifier line. The unique number will be inserted
# between the two.
ss = re.compile(r'^>([\S\.\-\_]+)(.*\n)') # Removed a \s in (\s.*\n) at the end 201008xx
number = 1
for line in infile:
if line.startswith(">"):
hit = re.match(ss,line)
if hit is not None:
# Append the number and attach the rest of the line
# and write to outfile
outfile.write(''.join([">",hit.group(1),"--",str(number),hit.group(2)]))
number = number + 1
else:
# This would be the entire sequence on following lines until next ">"
outfile.write(line)
return
############## END uniqueify_seqids
##-----------------------------------------------##
## RUN BLASTCLUST ##
##-----------------------------------------------##
[docs]def run_blastclust(filename,PercentIdentity=90,CovThreshold=50,numcores=4):
'''
Run formatdb to create a BLAST database, then run
blastclust on that database to cluster all hits.
Input:
filename filename with sequences with unique ids to cluster.
PercentIdentity the percent identity to cluster with, default
is 90 %.
CovThreshold the minimum length coverage threshold for
clustering, default is 50 %
(numcores) optional argument specifying the number
of cores to run blastclust on, default is
4 and 0 means all available.
Output:
(None) Writes output to a file, 'filename.clusters'
that contains all identified clusters on
each row.
Errors:
PathError raied if there is something wrong with the paths
to output or input files.
ValueError raised if there is something wrong
'''
from os import path
import shlex, subprocess
if not(path.exists(path.abspath(filename))):
raise PathError("ERROR: The path to unique sequence id hit file is incorrect")
# Create string for calling the subprocess 'formatdb'
formatdb_call = ''.join(["formatdb -t SignificantHits_UniqueSeqIDs -i ",filename])
try:
formatdb = shlex.split(formatdb_call)
subprocess.call(formatdb)
return_text = "Created BLAST database with unique sequence IDs"
except OSError:
return_text = "The formatdb command could not be run:", formatdb_call
# Run blastclust in a similar way
if not isinstance(CovThreshold,str):
CovThreshold = str(CovThreshold)
if not isinstance(PercentIdentity,str):
PercentIdentity = str(PercentIdentity)
if not isinstance(numcores,str):
numcores = str(numcores)
blastclust_call = ''.join(["blastclust -d ",path.abspath(filename),
" -S ",PercentIdentity,
" -a ",numcores,
" -L ",CovThreshold,
" -o ",filename,".clusters"])
try:
blastclust = shlex.split(blastclust_call)
blastclust_output = subprocess.Popen(blastclust,\
stdout=subprocess.PIPE,\
stderr=subprocess.PIPE).communicate()
except OSError:
returnstring = ''.join(["ERROR: Blastclust could not be run. Is it properly installed?\n",
"The following call was made: $", blastclust_call])
return ("error",returnstring)
return (return_text,blastclust_output)
############## END run_blastclust
##-----------------------------------------------##
## PARSE OUTPUT FROM BLASTCLUST ##
##-----------------------------------------------##
[docs]def parse_blastclust(filename):
'''
Parses blastclust output into a nested list structure
Input:
filename filename of blastclust output
Output:
sequenceIDs list of sequence IDs with unique identifiers
right after the '>' symbol
Errors:
PathError raised if the file does not exists
ValueError rasied if no unique identifiers could be found
and removed
'''
from os import path
if not(path.isfile(path.abspath(filename))):
raise PathError("ERROR: The blastclust output file could not be opened")
try:
filepath = path.abspath(filename)
file = open(filepath,'r')
except OSError:
print "ERROR: Could not open", filepath
exit(1)
sequenceIDs = []
for line in file:
sequenceIDs.append(line.rstrip("\n ").split(" "))
if sequenceIDs == []:
raise ValueError
return sequenceIDs
############## END parse_blastclust
##-----------------------------------------------##
## DE-UNIQUEIFY SEQUENCE IDS ##
##-----------------------------------------------##
[docs]def deuniqueify_seqids(sequenceIDs,readfile=False):
'''
Removes unique identifiers appended to sequence IDs in
fasta format by 'uniqueify_seqids'.
Second option is to read a file and de-unique:ify the
sequence identifiers found in that file;
the file formats understood are regular fasta and
clustalw output.
Input:
sequenceIDs list of sequence IDs (parsed from blastclust
output)
readfile boolean determining whether sequenceIDs contains
a filename to read and correct rather than
a list of sequence IDs parsed from blastclust.
Output:
sequenceIDs list of sequence IDs without unique identifiers
right after the '>' symbol
None if readfile was true nothing is return, instead
changes are written directly to file.
Errors:
ValueError rasied if no unique identifiers could be found
and removed
PathError raised if there was some error regarding
the file to be read.
'''
from os import path
import re
if readfile:
try:
filein = open(path.abspath(sequenceIDs),'r')
fileout = open(path.abspath(sequenceIDs+".restored"),'w')
except OSError:
raise PathError("Could not open multiple alignment file to de-uniqueify!")
line = filein.readline()
# FASTA FORMAT
if line.startswith(">"):
print "FASTA format detected"
regex = r'^(>[\S\.-_]+)--\d+(\s.*\n)'
regex = re.compile(regex)
hit = re.match(regex,line)
if hit is not None:
fileout.write(''.join([hit.group(1), hit.group(2)]))
else:
print "Parse error of first line!!"
return
while filein:
line = filein.readline()
if line == "":
break
hit = re.match(regex,line)
if hit is not None:
fileout.write(''.join([hit.group(1), hit.group(2)]))
else:
fileout.write(line)
# CLUSTAL FORMAT
elif line.startswith("CLUSTAL"):
print "ClustalW format detected"
regex = r'([\S\-_]+)(--\d+)(\s+.*\n)'
regex = re.compile(regex)
fileout.write(line)
spacesize = 0
while filein:
line = filein.readline()
if line == "":
break
hit = re.match(regex,line)
if hit is not None:
fileout.write(''.join([hit.group(1), hit.group(3)]))
spacesize = len(hit.group(2))
else:
fileout.write(line[spacesize:])
# UNKNOWN FORMAT; DIE
else:
print "No known format detected; accepts only CLUSTAL or fasta"
return
# NOT READING FROM FILE BUT FROM LIST OF SEQUENCES
else:
clean_sequenceIDs = []
for cluster in sequenceIDs:
temp_cluster = [] # Temporary storage of cluster members
for seqID in cluster:
# Match the number at the end and remove it
pattern = r'([\S\-_]+)--\d+'
found = re.match(pattern,seqID)
if found is not(None):
temp_cluster.append(found.group(1))
clean_sequenceIDs.append(temp_cluster)
if clean_sequenceIDs == []:
raise ValueError
return clean_sequenceIDs
############## END deuniqueify_seqids
##-----------------------------------------------##
## MULTIPLE ALIGNMENT WITHIN AND BETWEEN CLUSTERS##
##-----------------------------------------------##
[docs]def malign_clusters(clusters,resdir,refseqpath,workdir):
'''
Reads clusters and from file reads complete sequences
for the sequence IDs specified in the clusters.
Note that it reads sequences from the filename
'workdir'/retrieved_sequences.fasta.
The function assumes that MAFFT is installed and
accessible through the PATH variable or in the current
directory. If refseqpath is empty the function does
not align against any reference sequences. Has hardcoded
the names of the five plasmid mediated Qnr-genes on line
1029.
Input:
clusters nested list stucture with clusters.
resdir directory to output results.
refseqpath path to file with reference qnr sequence
in fasta format. Can be an empty string if
no reference sequences are to be aligned
against.
workdir working directory with temporary files
(unique_retrieved_sequences.fasta.shortened).
Output:
(none) Writes multiple alignments to file:
resdir/cluster*.aligned
Errors:
PathError raised if retrieved_sequences.fasta
could not be opened/found.
'''
from os import path, system
import re
import shlex, subprocess
# USER EDITABLE LIST WITH REFERENCE GENES
references = ["QnrA","QnrB","QnrC","QnrD","QnrS"] # Can be modified!
# Read the file with sequences, fixed filename
# Each element in the list is a sequence
sequences = []
tempseqid = ""
tempseq = ""
try:
seqfile = open(''.join([workdir,"unique_retrieved_sequences.fasta.shortened"]),'r')
for line in seqfile:
if line.startswith(">"):
sequences.append(''.join([tempseqid,tempseq]))
tempseqid = line
tempseq = ""
elif not line.startswith(">"):
tempseq = ''.join([tempseq,line])
sequences.append(''.join([tempseqid,tempseq]))
except OSError:
raise PathError(''.join(["Could not open/find ",workdir,"'unique_retrieved_sequences.fasta'"]))
## Not using clustalw any more, the log-file business has been commented out on purpose!
# Perform multiple alignment between members of clusters and write log
# of clustalw stuff to 'workdir'/clustalw.log
#clustalwlogfile = open(path.abspath(''.join([workdir,'clustalw.log'])),'w')
clusterno = 0
for cluster in clusters:
# Perform multiple alignment within cluster if there are
# more than one members in cluster
if len(cluster) > 1:
# Create file and open it for writing with proper name
clusterno = clusterno + 1
malignfilepath = path.abspath(''.join([resdir,"cluster",
str(clusterno)]))
malignfile = open(malignfilepath,"w")
# Find sequences for all sequence IDs in cluster and write to file
for seqid in cluster:
for sequence in sequences:
if seqid in sequence:
malignfile.write(sequence)
malignfile.close()
# Set parameters etc for clustalw and make alignment
infilepath = malignfilepath
outfilepath = ''.join([malignfilepath,".aligned"])
mafft_call = ''.join(["mafft --quiet ",infilepath," > ",outfilepath])
errors = system(mafft_call)
if errors != 0:
return 2 # Error code for mafft error
# The following code has been commented out because the
# usage of mafft is preferred over clustalw. It can be
# reused if one prefers clustalw.
# It does produce worse alignments, especially over sequences
# of very different lengths...
#clustalw_call = ''.join(["clustalw -INFILE=",infilepath,
# " -OUTFILE=",outfilepath])
#args = shlex.split(mafft_call)
#subprocess.call(args)
#clustalwlog = subprocess.Popen(args,stdout=subprocess.PIPE,
# stderr=subprocess.PIPE).communicate()
#if not len(clustalwlog[0]) == 0:
# clustalwlogfile.write(clustalwlog[0])
if clusterno == 0:
#print "\n!! --- No clusters contained more than one member (only singletons)\n"
return 1 # No need to stay here, lets go home to our box on the highway
# If refseqpath is empty just skip over
# this alignment against reference sequences.
if refseqpath is not "":
# Read the reference sequences from the path specified
# Each element in the list is a complete sequence
refsequences = []
tempseqid = ""
tempseq = ""
try:
seqfile = open(refseqpath,'r')
for line in seqfile:
if line.startswith(">"):
refsequences.append(''.join([tempseqid,tempseq]))
tempseqid = line
tempseq = ""
elif not line.startswith(">"):
tempseq = ''.join([tempseq,line])
refsequences.append(''.join([tempseqid,tempseq]))
except OSError:
raise PathError(''.join(["ERROR: cannot open", refseqpath]))
# Perform multiple alignment against all reference plasmid mediated qnr-genes
# for each cluster
clusterno = 0
for cluster in clusters:
if len(cluster) > 1:
clusterno = clusterno + 1
for reference in references:
malignfilepath = path.abspath(''.join([resdir,"ref",reference,
".cluster",str(clusterno)]))
malignfile = open(malignfilepath,"w")
# Find sequences for all sequence IDs in cluster and write to file
for seqid in cluster:
for sequence in sequences:
if seqid in sequence:
malignfile.write(sequence)
# Also write the reference gene to the same file
for refseq in refsequences:
if reference in refseq:
malignfile.write(refseq)
malignfile.close()
# Set parameters etc for clustalw and make alignment
infilepath = malignfilepath
outfilepath = ''.join([malignfilepath,".aligned"])
mafft_call = ''.join(["mafft --quiet ",infilepath," > ",outfilepath])
errors = system(mafft_call)
if errors != 0:
return 2 # Error code for mafft error
# The following code has been commented out because the
# usage of mafft is preferred over clustalw. It can be
# reused if one prefers clustalw. Using clustalw is
# 'safer' in a system sense, but gives less good alignments...
#clustalw_call = ''.join(["clustalw -INFILE=",infilepath,
# " -OUTFILE=",outfilepath])
#args = shlex.split(mafft_call)
#clustalwlog = subprocess.Popen(args,stdout=subprocess.PIPE,
# stderr=subprocess.PIPE).communicate()
#if not len(clustalwlog[0]) == 0:
# clustalwlogfile.write(clustalwlog[0])
return
############## END malign_clusters
##-----------------------------------------------##
## LIMIT SEQUENCE LENGTH ##
##-----------------------------------------------##
[docs]def limit_sequence_length(sequencefile,MAX_LINES=64):
'''
Takes a sequence file and shortens all sequences
it to the MAX_LINES supplied. It is divided by
inserting the sequence ID header ">seqid...."
after the maximum sequence length position, thus
creating several smaller sequence segments with
the same sequence ID and header line.
Recommended MAX_LINES is 64 (64 lines of 80
amino acid residues = 5120).
As the function splits at "\n" characters it
requires the fasta file to keep sequences on
several lines - fasta files with sequences on
a single line will NOT work.
Input:
sequencefile filename string.
MAX_LINES maximum number of lines for one
sequence, defaults to 64.
Output:
(none) Writes directly to disk to
'sequencefile.shortened'.
Errors:
PathError raised if there is something wrong with path
'''
from os import path
if not path.isfile(path.abspath(sequencefile)):
raise PathError(''.join(["ERROR: Path ",sequencefile," is not a valid file!"]))
try:
file = open(path.abspath(sequencefile),'r')
outfile = open(path.abspath(sequencefile+'.shortened'),'w')
except OSError:
raise PathError("Could not open sequence and/or output file!")
# Reads the ENTIRE file into memory - it is assumed
# that the number of sequences retrieved does not
# require more memory than what can fit into memory.
entire_file = file.read()
# Splits at all sequences but the first (it does not start with a newline)
# This method introduces an empty line between the first and following
# sequences if it is split up, assumed non dangerous but still noted.
sequences = entire_file.split("\n>")
first = True # To give special treatment to first sequence
seqout = []
for sequence in sequences:
if not sequence == "":
if first:
# Already has initial ">"
lines = sequence.splitlines(True)
seqout = []
counter = 0
else:
# Add the'\n>' that was removed
sequence = ''.join(["\n>",sequence])
lines = sequence.splitlines(True)
seqout = []
counter = 0 # Reset, starting with new sequence
for line in lines:
# Add lines of sequence until MAX_LINES,
# then add sequence identifier again and
# continue until end of sequence
if counter > MAX_LINES:
if first:
if not(lines[0].endswith("\n")):
seqout.append(''.join([lines[0],"\n"]))
else:
seqout.append(lines[0])
else:
# The following sequences
# got split at the initial '\n'
# an the identifier is thus in
# element 1 instead of 0
if not(lines[1].endswith("\n")):
seqout.append(''.join([lines[1],"\n"]))
else:
seqout.append(lines[1])
if not(line.endswith("\n")):
seqout.append(''.join([line,"\n"]))
else:
seqout.append(line)
counter = 0
else:
seqout.append(line)
counter = counter + 1
#seqout.append("") # Probably not needed...
first = False # Now the first has been processed
outfile.write(''.join(seqout))
return
############## END limit_sequence_length
##-----------------------------------------------##
## RETRIEVE SEQUENCES FROM FASTA ##
##-----------------------------------------------##
[docs]def retrieve_sequences_from_fasta(fastapath,seqid_list):
'''
Retrieves sequence(s) from a fasta file. Python
implementation so it is a bit slow, not intended
for retrieving sequences from large databases.
Input:
fastapath string with path to fasta file
seqid_list list with sequence IDs to retrieve
Output:
sequences list of strings in fasta format with all
sequences to retrieve, containing
linebreaks after sequence ID headers.
Errors:
PathError raised if the supplied path does not
exists or something related to that.
ValueError raised if no sequences are found in the
database.
'''
from os import path
fastapath = path.abspath(fastapath)
if not(path.isfile(fastapath)):
raise PathError("ERROR: Path to fasta file is incorrect.")
seqid_list = list(seqid_list)
db = open(fastapath,"r")
sequences = [] #PREALLOC
#print "Trying to retrieve",len(seqid_list),"sequences from fasta file:", fastapath
seq_counter = 0
line = db.readline()
while db:
if line.startswith(">"): #Reached sequence descriptor line
for id in seqid_list:
if id in line:
seq_counter = seq_counter + 1
sequencestring = line
seq = True
while seq:
line = db.readline()
if not(line.startswith(">")):
sequencestring = ''.join([sequencestring,line])
elif line.startswith(">"):
seq = False
sequences.append(sequencestring)
else:
line = db.readline()
if seqid_list == []:
break
elif line == "": # EOF
db = False
else:
line = db.readline()
if sequences == []:
raise ValueError # No sequence was found!
#print "Retrieved",seq_counter,"sequences from file"
return sequences
############## END retrieve_sequences_from_fasta
##-----------------------------------------------##
## CLEANUP ##
##-----------------------------------------------##
[docs]def cleanup(tmpdir,exitcode=0):
'''
Cleans up (moves) error.log and formatdb.log to
specified tmp directory
Input:
tmpdir name of the temporary directory to which to
move the two log files.
exitcode the exitcode to give the shell, defaults to 0.
Output:
(none)
Errors:
(none) Will print error messages to stderr if
the files does not exist or could not
be moved.
'''
import shlex, subprocess
from sys import exit
args = shlex.split("mv error.log formatdb.log "+tmpdir)
out = subprocess.Popen(args, stdout=subprocess.PIPE,
stderr=subprocess.PIPE).communicate()
if out[1] != "":
pass
##-----------------------------------------------##
## CUSTOM EXCEPTIONS ##
##-----------------------------------------------##
# fluff.PathError General error with path
class PathError(Exception):
def __init__(self,message):
self.message = message
def __str__(self):
return repr(self.message)
# fluff.ParseError General error when parsing
class ParseError(Exception):
def __init__(self,message):
self.message = message
def __str__(self):
return repr(self.message)
###############################################################################
## HERE FOLLOWS DEPRECATED AND OLD, ##
## BADLY PERFORMING CODE FOR FUTURE REFERENCE ##
###############################################################################
###-----------------------------------------------##
### RETRIEVE SEQUENCES FROM DB ## OLD SLOW PYTHON
###-----------------------------------------------##
#def retrieve_sequences_from_db_old(dbpath,seqid_list):
# '''
# Retrieves sequence(s) from a sequence database
# Input:
# dbpath string with path to database
# seqid_list list with sequence IDs to retrieve
# Output:
# sequences list of strings in fasta format with all
# sequences to retrieve, containing
# linebreaks after sequence ID headers.
# Errors:
# PathError raised if the supplied path does not
# exists or something related to that.
# ValueError raised if no sequences are found in the
# database.
# '''
#
# from os import path
#
# dbpath = path.abspath(dbpath)
# if not(path.isfile(dbpath)):
# raise PathError("ERROR: Path specified in the hmmsearch output is not valid.")
#
#
# seqid_list = list(seqid_list)
# db = open(dbpath,"r")
# sequences = [] #PREALLOC
#
# print "Trying to retrieve",len(seqid_list),"sequences from database:", dbpath
# seq_counter = 0
# while db:
# line = db.readline()
# if line.startswith(">"): #Reached sequence descriptor line
# for id in seqid_list:
# if id in line:
# seqid_list.remove(id)
# seq_counter = seq_counter + 1
# sequencestring = line
# seq = True
# while seq:
# line = db.readline() #.rstrip("\n")
# if not(line.startswith(">")):
# sequencestring = ''.join([sequencestring,line])
# elif line.startswith(">"):
# seq = False
# sequences.append(sequencestring)
#
# if seqid_list == []:
# break
#
# elif line == "": # EOF
# db = False
#
# if sequences == []:
# raise ValueError # No sequence was found!
#
# print "Retrieved",seq_counter,"sequences from database"
# return sequences
############### END retrieve_sequences_from_db_old
###-----------------------------------------------##
### RETRIEVE SEQUENCES FROM DB ##
###-----------------------------------------------##
#def retrieve_sequences_from_db(dbpath,seqid_list,outfile):
# '''
# Retrieves sequence(s) from a sequence database. Requires
# the database to have an indexfile in the same folder
# called database.index.
# THIS SHOULD BE REMADE TO MAKE USE OF CDBFASTA
#
# Input:
# dbpath string with path to database
# seqid_list list with sequence IDs to retrieve
# outfile argument with path to output file,
# Output:
# (none) Writes output directly to file
# specified in argument 'outfile'.
# Errors:
# PathError raised if the supplied path does not
# exists or something related to that.
# '''
#
# from os import path
# import shlex, subprocess
#
# dbpath = path.abspath(dbpath)
# if not(path.isfile(dbpath)):
# raise PathError("ERROR: Path specified in the hmmsearch output is not valid.")
#
#
# seqid_list = list(seqid_list)
#
# print "Trying to retrieve",len(seqid_list),"sequences from database:", dbpath
# count = 0
# for seqid in seqid_list:
# retrseq_call = ''.join(['retrseqdb.sh ',seqid,' ',dbpath,' ',outfile])
# retrseqdb = shlex.split(retrseq_call)
# try:
# output = subprocess.Popen(retrseqdb,stdout=subprocess.PIPE,stderr=subprocess.PIPE).communicate()
# if 'No such file or directory' in output[1]:
# pass #There is probably no index file for the database
# elif not len(output[1]) == 0:
# count = count + 1
# except OSError:
# print "Could not find", retrseqdb[0]
#
# if count == 0:
# print ("Could not retrieve any sequences from database: "+dbpath+
# " ,is it missing an index-file?")
# else:
# print "Retrieved",count,"sequences from database"
# return
############### END retrieve_sequences_from_db