Source code for pyphylogenomics.MUSCLE

'''
======
MUSCLE
======

Reads in sequences from files, group homologous sequences based on gene IDs and aligns them.
'''

from Bio import SeqIO
from Bio import AlignIO
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
from Bio.SeqRecord import SeqRecord
from Bio.Align.Applications import MuscleCommandline
import subprocess
import sys
import time
import os
import string
import re
import requests
import glob


[docs]def batchAlignment(files):
    '''
    Reads in sequences from files, group homologous sequences and aligns them.

    ``files`` a list of file names. The first file in the list should
    be the master file (e.g. exon sequences of *Bombyx mori*).

    Example:

    >>> from pyphylogenomics import MUSCLE
    >>> files = ['Bmori_exons.fasta', 'Danaus_exons.fasta','Heliconius_exons.fasta','Manduca_exons.fasta']
    >>> MUSCLE.batchAlignment(files)

    All aligned sequences will be written into a folder called ``alignments`` as
    FASTA files (one file per exon).
    '''

    # create folder for saving alignments
    folder = "alignments"
    if not os.path.exists(folder):
        os.makedirs(folder)

    for idx, f in enumerate(files):  # Read each file into a dictionary.
        exec("handle%d = %s" % (idx + 1, "SeqIO.index(f, 'fasta')"))

    count = 0  # To count the number of homologous groups.
    print("If you are using Windows...")
    time.sleep(1)
    print("many command line shells might pop up now.")
    time.sleep(3)

    for ID1 in handle1:  # Parsing the master file.
        seqs_list = []
        seqs_list.append(SeqRecord(handle1[ID1].seq, id=ID1))
        print("Pooling gene %s" % ID1)

        idx = len(files)
        while idx > 1:  # Parsing the other files.
            script = '''for ID2 in handle%d:
    if ID1.split(':')[0] == re.sub(":.+$", "", ID2.split('-')[0]):
        seqs_list.append(SeqRecord(handle%d[ID2].seq, id = ID2))'''
            exec(script % (idx, idx))
            idx -= 1

        # TODO: Add a command for PSAs, for cases where the homologous group
        # has only 2 sequences.

        if len(seqs_list) > 2:  # Now perfom the MSA.
            count += 1

            path = os.path.join(folder, ID1.split(':')[0] + ".fasta")
            muscle_cline = MuscleCommandline(out=path, fasta=True)

            child = subprocess.Popen(str(muscle_cline),
                                     stdin=subprocess.PIPE,
                                     stderr=subprocess.PIPE,
                                     shell=(sys.platform != "win32"))  # In case the user's machine is not Windows.

            SeqIO.write(seqs_list, child.stdin, "fasta")
            child.stdin.close()

            time.sleep(3)  # Wait 3 seconds to not fill off the screen with CMDs.

    print("%d alignments have been saved in the folder \"%s\"" % (count, folder))


[docs]def bluntSplicer(folder_path, window=20):
    '''
    Splices Multiple Sequence Alignments objects found in *folder_path*.
    Objects should be in FASTA format. Gaps from both flanks of the alignments
    will be trimmed.

    Window size is used to find gaps in flanks (default 20 nucleotides).
    **bluntSplicer** reads the MSA files from *folder_path*, and stores the spliced MSAs
    in the same folder.

    Example:

    >>> from pyphylogenomics import MUSCLE
    >>> MUSCLE.bluntSplicer("alignments/") # folder_path containing the FASTA file alignments
    '''

    folder_path = folder_path.strip()
    folder_path = re.sub("/$", "", folder_path)

    # MUSCLE.batchAlignment.py created a alignments folder_path.
    for path in glob.glob(os.path.join(folder_path, "*.fasta")):
        alignment = AlignIO.read(path, "fasta")
        print("\nSplicing %s file" % path.split("\\")[-1])
        for i in range(0, alignment.get_alignment_length()):  # For checking gaps on the left flank.
            gap = False
            for record in alignment[:, i:i + window]:
                if "-" in record.seq:
                    gap = True
                    break
                else:
                    continue
            if gap is False:
                start = i
                break

        for i in range(alignment.get_alignment_length(), 0, -1):  # For checking gaps on the right flank.
            gap = False
            for record in alignment[:, i - window:i]:
                if "-" in record.seq:
                    gap = True
                    break
                else:
                    continue
            if gap is False:
                end = i
                break

        out = path.split(".fasta")[0] + "_bluntlySpliced.fasta"  # Saving edited MSAs in same folder.
        AlignIO.write(alignment[:, start:end], out, "fasta")


[docs]def designPrimers(folder, tm="55", min_amplength="100", max_amplength="500", gencode="universal", mode="primers", clustype="dna", amptype="dna_GTRG", email=""):
    '''
    It will send a FASTA alignment to `primers4clades`_ in order to design degenerate primers. Input data needed:

    * Alignment in FASTA format containing at least 4 sequences.
    * Several parameters:

        * temperature
        * minimium amplicon length
        * maximum amplicon length
        * genetic code
        * cluster type
        * substitution model
        * email address

   Example:
   The values shown are the default. Change them if needed.

    >>> from pyphylogenomics import MUSCLE

    >>> folder = "alignments"   # folder containing the FASTA file alignments
    >>> tm = "55"               # annealing temperature
    >>> min_amplength = "250"   # minimium amplicon length
    >>> max_amplength = "500"   # maximum amplicon length
    >>> gencode = "universal"   # see below for all available genetic codes
    >>> mode  = "primers"
    >>> clustype = "dna"
    >>> amptype = "dna_GTRG"    # substitution model used to estimate phylogenetic information
    >>> email = "youremail@email.com"   # primer4clades will send you an email with very detailed results

    >>> MUSCLE.designPrimers(folder, tm, min_amplength, max_amplength, gencode, mode, clustype, amptype, email)

    The best primer pairs will be printed to your screen. Detailed results will
    be saved as HTML files in your alignments folder. But it is recommended if
    you also get the results by email. primers4clades_ will send you one email
    for each alignment.

    The genetic code table (variable ``gencode``) can be any of the following:

    * ``universal`` for standard
    * ``2`` for vertebrate mitochondrial
    * ``3`` for yeast mitochondrial
    * ``4`` for mold and protozoa mitochondrial
    * ``5`` for invertebrate mitochondrial
    * ``6`` for ciliate
    * ``9`` for echinoderm and flatworm
    * ``10`` for  euplotid nuclear
    * ``11`` for  bacterial and plastid
    * ``12`` for  alternative yeast nuclear
    * ``13`` for  ascidian mitochondrial
    * ``14`` for  flatworm mitochondrial
    * ``15`` for  Blepharisma nuclear
    * ``16`` for  Chlorophycean mitochondrial
    * ``21`` for  Trematode mitochondrial
    * ``22`` for  Scenedesmus obliquus mitochondrial
    * ``23`` for  Thraustochytrium mitochondrial

    The evolutionary substitution model can be any of the following (variable
    ``amptype``):

    * ``protein_WAGG``  for protein WAG+G
    * ``protein_JTTG``  for protein JTT+G
    * ``protein_Blosum62G``  for protein Blosum62+G
    * ``protein_VTG``  for protein VT+G
    * ``protein_DayhoffG``  for protein Dayhoff+G
    * ``protein_MtREVG``  for protein MtREV+G
    * ``dna_HKYG``  for dna HKY+G
    * ``dna_GTRG``  for dna GTR+G
    * ``dna_K80G``  for dna K80+G
    * ``dna_TrNG``  for dna TrN+G
    * ``dna_JC69G``  for dna JC69+G

    .. _primers4clades: http://floresta.eead.csic.es/primers4clades/#0
    '''

    if os.path.exists(folder):
        # are there alignments/files in that folder?
        all_files = os.path.join(folder, "*")
        alns = glob.glob(all_files)

        if len(alns) > 0:
            url = "http://floresta.eead.csic.es/primers4clades/primers4clades.cgi"
            params = {'tm': tm, 'min_amplength': min_amplength,
                      'max_amplength': max_amplength, 'mode': mode, 'gencode':
                      gencode, 'clustype': clustype, 'email': email,
                      }

            primers = []
            for aln in alns:
                # match only files ending in .FAS[TA]
                if re.search("fas[ta]*$", aln, re.I):
                    print("\nProcessing file \"%s\"" % aln)
                    files = {'sequencefile': open(aln, 'rb')}
                    r = requests.post(url, files=files, data=params)

                    this_file = os.path.split(aln)[1]
                    this_file = re.sub(".fas.*", "", this_file)

                    # Save result to file
                    to_print = "Writing detailed results as file \""
                    to_print += str(aln) + ".html\""
                    print(to_print)

                    f = open(str(aln) + ".html", "w")
                    f.write(r.text)
                    f.close()

                    # Show primer pair to user
                    html_file = string.split(r.text, "\n")
                    i = 1
                    for line in html_file:
                        if "degen_corr" in line:
                            seq = line.split(" ")[0].strip()

                            description = line.split(" ")[2].strip()

                            this_id = this_file + "_" + line.split(" ")[1].strip()
                            this_id += "_" + str(i)

                            seq = Seq(seq, generic_dna)
                            seq_record = SeqRecord(seq)
                            seq_record.id = this_id
                            seq_record.description = description
                            primers.append(seq_record)
                            i = int(i)
                            i = i + 1

                        if i == 3:
                            break

            # Write primers to alignment file
            SeqIO.write(primers, "primers.fasta", "fasta")
            print("\nDone.\nAll primers have been saved in the file \"primers.fasta\"")

        else:
            print("\nError! the folder \"%s\" is empty.\n" % folder)

    else:
        print("\nError! the folder \"%s\" does not exists.\n" % folder)