Source code for pyphylogenomics.NGS

#!/usr/bin/env python

'''
===
NGS
===

Prepares output data from sequencing round in IonTorrent (FASTQ format file).

* Changes quality format from Phred to Solexa (which is required by the fastx-toolkit).
* Changes sequences id to incremental numbers.
* Creates temporal FASTA file.

This module also finds matches between reads and expected gene regions by using
BLASTn, and separates reads based on matches against indexes.

It also has functions to do assembly of reads using ``velvet``.
'''
import glob
import multiprocessing
import os
import re
import requests
import shutil
import subprocess
import sys

from Bio import SeqIO


[docs]def prepare_data(ionfile, index_length):
    '''
    * Changes quality format from Phred to Solexa (which is required by the fastx-toolkit).
    * Changes sequences id to incremental numbers.
    * Creates temporal FASTA file with the indexes removed from the sequences.

    Files generated will be written to folder ``data/modified/``

    * ``ionfile`` argument is FASTQ format file as produced by IonTorrent
    * ``index_length`` number of base pairs of your indexes. This is necessary \
                       to trim the indexes before blasting the FASTA file      \
                       against the reference gene sequences.

    Example:

    >>> from pyphylogenomics import NGS
    >>> ionfile = "ionrun.fastq"
    >>> index_length = 8
    >>> NGS.prepare_data(ionfile, index_length)
    Your file has been saved using Solexa quality format as data/modified/wrk_ionfile.fastq
    Your sequence IDs have been changed to numbers.
    The FASTA format file data/modified/wrk_ionfile.fasta has been created.
    '''
    # create folder to keep data
    folder = os.path.join("data", "modified")
    if not os.path.exists(folder):
        os.makedirs(folder)

    # change quality format from Phred to Solexa (required by fastx-toolkit)
    # write file to work on
    wrkfile = os.path.join(folder, "wrk_ionfile.fastq")
    SeqIO.convert(ionfile, "fastq", wrkfile, "fastq-solexa")
    print("Your file has been saved using Solexa quality format as " + wrkfile)

    # change sequences id to incremental numbers
    command = "fastx_renamer -n COUNT -i " + wrkfile + " -o tmp.fastq"
    p = subprocess.check_call(command, shell=True)
    if p != 0:
        print("\nError, couldn't execute " + command)
        sys.exit()
    print("Your sequence IDs have been changed to numbers.")

    # replace working file with temporal file
    os.rename("tmp.fastq", wrkfile)

    # create temporal FASTA file
    command = "fastq_to_fasta -i " + wrkfile + " -o tmp.fasta"
    p = subprocess.check_call(command, shell=True)

    # trim index region
    index_length = int(index_length) + 1
    command = "fastx_trimmer -f " + str(index_length) + " -i tmp.fasta "
    command += "-o " + os.path.join(folder, "wrk_ionfile.fasta")
    p = subprocess.check_call(command, shell=True)

    if os.path.isfile("tmp.fasta"):
        os.remove("tmp.fasta")

    print("The FASTA format file " + os.path.join(folder, "wrk_ionfile.fasta")
          + " has been created.")


[docs]def parse_blast_results(blast_table, ion_file):
    '''
    This function uses the BLAST results from a CSV file and separates the
    IonTorrent reads into bins that match each of the target genes (or reference
    sequences).
    To speed up things a few tricks are made:
    * Split CSV file into several chunks.
    * Split ionfile.fastq into complementary chunks so that the reads in the CSV chunks will be found in our fastq chunks.
    Reads will be written into separate FASTQ files, one per matching target
    gene.
    These files will be written in the folder ``output``.

    This step will accept matching reads that align more than 40bp to the
    expected gene sequence. Function :py:func:`NGS.filter_reads`

    '''
    # create folder to keep output data
    folder = "output"
    if not os.path.exists(folder):
        os.makedirs(folder)

    # split results file into chunks
    p = split_results_file(blast_table)
    if p != 0:
        print("\nError, couldn't execute parse_blast_results")
        sys.exit()

    # move splitted blast results to output/
    for file in glob.iglob("_re*"):
        src = file
        file += ".csv"
        dest = os.path.join("output", file)
        os.rename(src, dest)

    jobs = []

    # split fastq file into chunks
    for blast_chunk in glob.iglob(os.path.join("output", "_re*.csv")):
        jobs.append(multiprocessing.Process(target=split_ionfile_by_results, args=(ion_file, blast_chunk)))

    for job in jobs:
        job.start()
        job.join()

    # progressbar
    progressbar_width = 20
    sys.stdout.write("Progress: [%s]" % (" " * progressbar_width))
    sys.stdout.flush()
    sys.stdout.write("\b" * (progressbar_width + 1))

    # now we need to iterate over each chunk of blast result and fastq file
    # and separate the reads into bins according to matching gene
    # filter_reads(ion_chunk, blast_chunk):
    # folder is the "output" folder that we are using to keep our result data
    jobs = []
    for ion_chunk in glob.iglob(os.path.join("output", "_re*.fastq")):
        blast_chunk = ion_chunk[:-5] + "csv"
        jobs.append(multiprocessing.Process(target=filter_reads, args=(ion_chunk, blast_chunk, folder)))

    for job in jobs:
        sys.stdout.write("#")
        job.start()
        job.join()
        sys.stdout.flush()

    sys.stdout.write("\n")


[docs]def separate_by_index(fastq_file, index_list, folder="", levenshtein_distance=1):
    from Bio import SeqIO
    from pyphylogenomics import NGS

    '''
    This function divides FASTQ reads into bins according to a list of indexes
    (or barcodes).
    The *index_list* should be in FASTA format.
    It will compare the template indexes and those in the reads and accept
    indexes with a difference no bigger than the *levenshtein* distance (default
    1 base pair difference).

    See http://en.wikipedia.org/wiki/Levenshtein_distance

    * ``fastq_file`` FASTQ format containing reads as produced by IonTorrent
    * ``index_list`` FASTA format file containing indexes (or barcodes)
    * ``folder`` *Optional*: Directory containing FASTQ format files to process
    * ``levenshtein_distance`` *Optional*, default = 1: Maximum number of different nucleotides that will be accepted when comparing template and sequenced indexes (due to errors in base calling during sequencing).

    Example:

    >>> from pyphylogenomics import NGS
    >>> fastq_file = "gene_rps5.fastq"
    >>> index_list = "indexes.fasta"
    >>> folder     = "output"
    >>> NGS.separate_by_index(fastq_file, index_list, folder)

    You can also automate parsing many FASTQ files at once:

    >>> from pyphylogenomics import NGS
    >>> import glob; # this module allow us selecting many files by using wildcards
    >>> index_list = "indexes.fasta"
    >>> folder     = "output"
    >>> for file in glob.glob("output/gene*.fastq"):
    ...     NGS.separate_by_index(file, index_list, folder)
    '''
    print("Processing file " + fastq_file)
    if folder != "":
        folder = re.sub("/$", "", folder)
        folder = os.path.abspath(folder)
        print("Output files will be written into " + folder)

    for seq_record in SeqIO.parse(index_list, "fasta"):
        for fastq_record in SeqIO.parse(fastq_file, "fastq"):
            found_index = ""
            found_index = NGS.find_index_in_seq(seq_record, fastq_record, levenshtein_distance)
            if found_index == "TRUE":
                basename = os.path.basename(fastq_file)
                if folder != "":
                    filename = "index_" + str(seq_record.id) + "_" + re.sub(".fastq", "", basename) + ".fastq"
                    filename = os.path.join(folder, filename)
                else:
                    filename = "index_" + str(seq_record.id) + "_" + re.sub(".fastq", "", basename) + ".fastq"
                output_handle = open(filename, "a")
                SeqIO.write(fastq_record, output_handle, "fastq")
                output_handle.close()


[docs]def assembly(fastq_file, index_length, min_quality=20, percentage=70, min_length=50):
    '''
    Do *de novo* assembly of expected sequences after doing quality control of a
    FASTQ file. Quality control includes dropping reads with low quality values,
    trimming of bad quality end and trimming index region.

    * ``fastq_file`` FASTQ format file that ideally has been separated by gene and index using the functions :py:func:`NGS.parse_blast_results` and :py:func:`NGS.separate_by_index`
    * ``index_length`` number of base pairs of the indexes. They will be trimmed from the reads during processing.
    * ``min_quality`` minimum quality score to keep (20 by default)
    * ``percentage``  minimum percent of base pairs that need to have ``min_quality`` (70% by default)
    * ``min_length`` minimum length of read sequences to keep (50 by default)

    Example:

    >>> from pyphylogenomics import NGS
    >>> fastq_file   = "index_Ion_4_gene_rps5.fastq"
    >>> index_length = 8
    >>> min_quality  = 30; # optional
    >>> percentage   = 80; # optional
    >>> min_length   = 60; # optional
    >>> NGS.assembly(fastq_file, index_length, min_quality, percentage, min_length)
    '''
    # Do quality control first
    q = quality_control(fastq_file, index_length, min_quality, percentage, min_length)
    try:
        with open('assembly_velvet.sh'):
            pass
    except IOError:
        # downloading scripts to local folder
        r = requests.get("https://raw.github.com/carlosp420/PyPhyloGenomics/master/assembly_velvet.sh")
        f = open("assembly_velvet.sh", "w")
        f.write(r.content)
        f.close()

        r = requests.get("https://raw.github.com/carlosp420/PyPhyloGenomics/master/assembly_velvet2.sh")
        f = open("assembly_velvet2.sh", "w")
        f.write(r.content)
        f.close()

    if q == "ok":
        command = "bash assembly_velvet.sh filter3.fastq"
        filter3_output = subprocess.check_output(command, shell=True)
        filter3_params = get_velvet_params(filter3_output)

        best_input_kmer = guess_best_kmer(filter3_params)
        command = "bash assembly_velvet2.sh " + best_input_kmer[0] + ".fastq "
        command += str(best_input_kmer[1])
        assembly = subprocess.check_call(command, shell=True)

        if assembly == 0:
            if count_reads("test/contigs.fa", "fasta") > 0:
                print("The assembly produced " + str(count_reads("test/contigs.fa", "fasta")) + " potential contigs")
                filename = re.sub(".fastq$", "", fastq_file) + "_assembled.fasta"
                os.rename("test/contigs.fa", filename)
                print("Assembled sequence has been saved as file " + filename)
    else:
        print("Couldn't process file " + fastq_file)


[docs]def count_reads(fastqFile, file_format):
    '''
    \* *Internal function* \*
    '''
    count = 0
    for seq_record in SeqIO.parse(fastqFile, file_format):
        count = count + 1
    return(count)


# ----------------------------------------------------------------------------
# @input: output from runing velvet assembly on all Kmer values
# @output: a dictionary with the parameters: kmer, nodes, n50, max, total
def get_velvet_params(output):
    '''
    \* *Internal function* \*
    '''
    output = output.split("\n")
    mydict = dict()
    kmer = 31
    for line in output:
        if "n50" in line:
            lista = dict()
            nodes = re.search("(\d+)\snodes", line)
            nodes = nodes.groups()[0]
            lista['nodes'] = nodes

            n50 = re.search("n50 of (\d+)", line)
            n50 = n50.groups()[0]
            lista['n50'] = n50

            maxim = re.search("max\s(\d+)", line)
            maxim = maxim.groups()[0]
            lista['max'] = maxim

            total = re.search("total\s(\d+)", line)
            total = total.groups()[0]
            lista['total'] = total

            mydict[kmer] = lista
            kmer = kmer - 2
    return(mydict)


# ----------------------------------------------------------------------------
# @input: params from two runs of velvet on all Kmer values
#            these inputs are dictionaries
# @output: a list containing:
#            - the filtered file number either filter2 or filter3
#            - the best kmer value found by comparison of the two
[docs]def guess_best_kmer(filter3_params):
    '''
    \* *Internal function* \*
    '''
    n50 = []
    for i in filter3_params:
        n50.append(int(filter3_params[i]['n50']))
        n50.sort()
        n50.reverse()
    filter3_n50 = n50[0]

    for i in filter3_params:
        if filter3_params[i]['n50'] == str(filter3_n50):
            return(["filter3", i])


# ----------------------------------------------------------------------------
# @input: output from running velvet assembly on all Kmer values
# @output: a dictionary with the parameters: kmer, nodes, n50, max, total
[docs]def get_velvet_params(output):
    '''
    \* *Internal function* \*
    '''
    output = output.split("\n")
    mydict = dict()
    kmer = 31
    for line in output:
        if "n50" in line:
            lista = dict()
            nodes = re.search("(\d+)\snodes", line)
            nodes = nodes.groups()[0]
            lista['nodes'] = nodes

            n50 = re.search("n50 of (\d+)", line)
            n50 = n50.groups()[0]
            lista['n50'] = n50

            maxim = re.search("max\s(\d+)", line)
            maxim = maxim.groups()[0]
            lista['max'] = maxim

            total = re.search("total\s(\d+)", line)
            total = total.groups()[0]
            lista['total'] = total

            mydict[kmer] = lista
            kmer = kmer - 2
    return(mydict)


[docs]def quality_control(fastq_file, index_length, min_quality=20, percentage=70, min_length=50):
    '''
    \* *Internal function* \*

    Use fastx-tools to do quality control on FASTQ file.
    '''

    command = "fastq_quality_filter -q " + str(min_quality) + " -p " + str(percentage)
    command += " -i " + fastq_file + " -o filter1.fastq"
    p = subprocess.check_call(command, shell=True)
    print("Processing file " + fastq_file)

    index_length = index_length + 1
    command = "fastx_trimmer -f " + str(index_length) + " -i filter1.fastq -o filter2.fastq"
    p = subprocess.check_call(command, shell=True)
    print("Removing indexes")

    command = "fastq_quality_trimmer -t " + str(min_quality) + " -l "
    command += str(min_length) + " -i filter2.fastq -o filter3.fastq"
    p = subprocess.check_call(command, shell=True)
    print("Trimming low quality end")

    if p == 0:
        return("ok")


[docs]def find_index_in_seq(barcode, seq, levenshtein_distance):
    '''
    Internal function

    Arguments:  *barcode*,  *read sequence*.
    Iterate through primer's degenerated IUPAC and try to find it in sequence
    read.
    Return TRUE on success.
    '''
    found = "false"

    while (found != "true"):
        barcode_seq = str(barcode.seq)
        barcode_read = str(seq.seq)[0:len(barcode_seq)]

        # compare levenshtein
        distance = levenshtein(barcode_seq.upper(), barcode_read.upper())

        if distance < levenshtein_distance + 1:
            # accept
            found = "true"

            # if str(seq.id) not in fastq_id_list:
            #     fastq_id_list.append(str(seq.id))
            return("TRUE")
        else:
            # reversecomplement
            barcode_seq = str(barcode.seq.reverse_complement())

            # compare levenshtein
            distance = levenshtein(barcode_seq.upper(), barcode_read.upper())

            if distance < levenshtein_distance + 1:
                # print "Found reverse complement"
                # accept
                found = "true"

                # if str(seq.id) not in fastq_id_list:
                #     fastq_id_list.append(str(seq.id))
                return("TRUE")

        return("FALSE")


[docs]def levenshtein(a, b):
    '''
    \* *Internal function* \*

    Calculates the Levenshtein distance between a and b.
    '''
    n, m = len(a), len(b)
    if n > m:
        # Make sure n <= m, to use O(min(n,m)) space
        a, b = b, a
        n, m = m, n

    current = range(n + 1)
    for i in range(1, m + 1):
        previous, current = current, [i] + [0] * n
        for j in range(1, n + 1):
            add, delete = previous[j] + 1, current[j - 1] + 1
            change = previous[j - 1]
            if a[j - 1] != b[i - 1]:
                change = change + 1
            current[j] = min(add, delete, change)

    return(current[n])


[docs]def split_results_file(blast_table):
    '''
    \* *Internal function* \*

    Split a CSV format result file from a BLAST in chunks.
    '''
    # build an slicer for Python
    command = "split -l 10000 " + blast_table + " _re"
    p = subprocess.check_call(command, shell=True)
    return(p)


[docs]def split_ionfile_by_results(ion_file, blast_chunk):
    '''
    \* *Internal function* \*

    This function divides an IonTorrent run file into chunks that match chunks
    of a BLAST results file in CSV format.
    '''

    # get first id from result chunk
    with open(blast_chunk) as myfile:
        head = myfile.readline()
        head = head.split(",")
        first_id = "@" + head[0]

    # get last id from result chunk
    number_of_lines_in_chunk = sum(1 for line in open(blast_chunk))
    fp = open(blast_chunk)
    for i, line in enumerate(fp):
        if i == number_of_lines_in_chunk - 1:
            last_id = "@" + line.split(",")[0]
    fp.close()

    # get line with first and last ids in ionfile
    fp = open(ion_file)
    first_line = 0
    last_line = 0
    for i, line in enumerate(fp):
        if line.strip() == first_id:
            first_line = i
        if line.strip() == last_id:
            # adding 3 lines due to this being FASTQ file
            last_line = i + 3
    fp.close()

    # cut ionfile using first and last lines
    # _reaa.fastq
    ion_chunk = open(blast_chunk[:-4] + ".fastq", "w")

    fp = open(ion_file)
    for i, line in enumerate(fp):
        if i >= first_line:
            ion_chunk.write(line)
        if i > last_line - 1:
            print("Splitting " + ion_file + " into chunk " + blast_chunk[:-4] + ".fastq")
            break
    fp.close()

    ion_chunk.close()

    # return ion_chunk filename
    return(blast_chunk + ".fastq")


[docs]def prune(folder, blast_data, seq_record, ion_id, min_aln_length):
    '''
    \* *Internal function* \*

    Takes a list of BLAST results and gets the gene_ids to save the current
    FASTQ seq_record into a file.
    It alse removes from the list those results that have been saved to a file
    and returns the list.
    '''
    from Bio import SeqIO
    list = []
    for i in blast_data:
        if str(i.split(",")[0]) == str(ion_id):
            dic = {}
            dic['line'] = i
            i = i.split(",")
            dic['gene_id'] = i[1]
            dic['ion_id'] = ion_id
            dic['al_length'] = i[3]
            list.append(dic)

    # avoid saving the same seq_record into the gene bin twice
    last_saved = ""
    for i in list:
        if last_saved != i['gene_id']:
            if int(i['al_length']) > int(min_aln_length):
                filename = os.path.join(folder, "gene_" + i['gene_id'] + ".fastq")
                exon_out = open(filename, "a")
                SeqIO.write(seq_record, exon_out, "fastq")
                exon_out.close()
                last_saved = i['gene_id']
        blast_data.remove(i['line'])

    return(blast_data)


[docs]def filter_reads(ion_chunk, blast_chunk, folder):
    from Bio import SeqIO
    from pyphylogenomics import NGS
    '''
    \* *Internal function* \*

    Accepting alignment lengths higher than 40 bp
    longer than our primer lengths
    '''
    min_aln_length = 40

    blast_file = open(blast_chunk, "r")
    tmp = blast_file.readlines()
    blast_file.close()

    blast_data = []
    for i in tmp:
        blast_data.append(i.strip())

    # iterate over ion torrent reads
    for seq_record in SeqIO.parse(ion_chunk, "fastq"):
        if len(blast_data) > 0:
            # print "\n\nNew record--------------------"
            # print "seq record id @%s" % seq_record.id
            # avoid processing seq_records that are not in blast file
            # first id in blast_data
            # print blast_data
            first_id_in_blast_data = blast_data[0].split(",")[0]
            # print "fist id in blast_data %s" % first_id_in_blast_data

            if int(seq_record.id) >= int(first_id_in_blast_data):
                # if str(seq_record.id) == ion_id and aln_length > min_aln_length:
                if str(seq_record.id) == first_id_in_blast_data:
                    # print "prune"
                    blast_data = NGS.prune(folder, blast_data, seq_record,
                                           first_id_in_blast_data, min_aln_length)
                else:
                    break