changeset 6:fe3ceb5c4214 draft

Uploaded
author jpetteng
date Fri, 05 Jan 2018 15:43:14 -0500
parents a202cc394af8
children b9146bf9de08
files ._ecoli_serotyping ecoli_serotyping/._blastFunctions.py ecoli_serotyping/._commandLineOptions.py ecoli_serotyping/._definitions.py ecoli_serotyping/._ectyper.py ecoli_serotyping/._ectyper.xml ecoli_serotyping/._genomeFunctions.py ecoli_serotyping/._loggingFunctions.py ecoli_serotyping/._predictionFunctions.py ecoli_serotyping/._speciesIdentification.py ecoli_serotyping/._subprocess_util.py ecoli_serotyping/blastFunctions.py ecoli_serotyping/commandLineOptions.py ecoli_serotyping/definitions.py ecoli_serotyping/ectyper.py ecoli_serotyping/ectyper.xml ecoli_serotyping/genomeFunctions.py ecoli_serotyping/loggingFunctions.py ecoli_serotyping/predictionFunctions.py ecoli_serotyping/speciesIdentification.py ecoli_serotyping/subprocess_util.py
diffstat 20 files changed, 1263 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
Binary file ._ecoli_serotyping has changed
Binary file ecoli_serotyping/._blastFunctions.py has changed
Binary file ecoli_serotyping/._commandLineOptions.py has changed
Binary file ecoli_serotyping/._definitions.py has changed
Binary file ecoli_serotyping/._ectyper.py has changed
Binary file ecoli_serotyping/._ectyper.xml has changed
Binary file ecoli_serotyping/._genomeFunctions.py has changed
Binary file ecoli_serotyping/._loggingFunctions.py has changed
Binary file ecoli_serotyping/._predictionFunctions.py has changed
Binary file ecoli_serotyping/._speciesIdentification.py has changed
Binary file ecoli_serotyping/._subprocess_util.py has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ecoli_serotyping/blastFunctions.py	Fri Jan 05 15:43:14 2018 -0500
@@ -0,0 +1,113 @@
+#!/usr/bin/env python
+
+"""
+Functions for setting up, running, and parsing blast
+"""
+import logging
+import os
+
+from ectyper import subprocess_util
+
+LOG = logging.getLogger(__name__)
+
+
+def create_blast_db(filelist, temp_dir):
+    """http://stackoverflow.com/questions/23944657/typeerror-method-takes-1-positional-argument-but-2-were-given
+    Creating a blast DB using the makeblastdb command.
+    The database is created in the temporary folder of the system.
+
+    Args:
+        filelist: list of genomes that was given by the user on the command line.
+        temp_dir: temporary directory to store the blastdb in.
+
+    Returns:
+        Full path to the DB
+    """
+    blast_db_path = os.path.join(temp_dir, 'ectyper_blastdb')
+
+    LOG.debug("Generating the blast db at {0}".format(blast_db_path))
+    cmd = [
+        "makeblastdb",
+        "-in", ' '.join(filelist),
+        "-dbtype", "nucl",
+        "-title", "ectyper_blastdb",
+        "-out", blast_db_path]
+    subprocess_util.run_subprocess(cmd)
+
+    return blast_db_path
+
+
+def run_blast(query_file, blast_db, args):
+    """
+    Execute a blastn run given the query files and blastdb
+
+    Args:
+        query_file (str): one or both of the VF / Serotype input files
+        blast_db (str): validated fasta files from the user, in DB form
+        args (Namespace object): parsed commadnline options from the user
+        chunck_size: number of genomes in the database
+
+    Returns:
+        The blast output file
+    """
+    percent_identity = args.percentIdentity
+    percent_length = args.percentLength
+
+    LOG.debug('Running blast query {0} against database {1} '.format(
+        query_file, blast_db))
+
+    blast_output_file = blast_db + '.output'
+
+    cmd = [
+        "blastn",
+        "-query", query_file,
+        "-db", blast_db,
+        "-out", blast_output_file,
+        '-perc_identity', str(percent_identity),
+        '-qcov_hsp_perc', str(percent_length),
+        '-max_hsps', '1', # each allele only need to hit once
+        # use default max_target_seqs=500
+        "-outfmt",
+        '6 qseqid qlen sseqid length pident sstart send sframe qcovhsp',
+        "-word_size", "11"
+    ]
+    subprocess_util.run_subprocess(cmd)
+    with open(blast_output_file, mode='rb') as fh:
+        for line in fh:
+            LOG.debug(line.decode('ascii'))
+    return blast_output_file
+
+def run_blast_for_identification(query_file, blast_db):
+    """
+    Execute a blastn run given the query files and blastdb
+    with special configuration for high performance identification
+
+    Args:
+        query_file: one or both of the VF / Serotype input files
+        blast_db: validated fasta files from the user, in DB form
+
+    Returns:
+        blast_output_file (str): path to the blast output file
+    """
+
+    LOG.debug('Running blast query {0} against database {1} '.format(
+        query_file, blast_db))
+
+    blast_output_file = blast_db + '.output'
+
+    cmd = [
+        "blastn",
+        "-query", query_file,
+        "-db", blast_db,
+        "-out", blast_output_file,
+        '-perc_identity', '90',
+        '-qcov_hsp_perc', '90',
+        '-max_target_seqs', '1',  # we only want to know hit/no hit
+        # 10 query seq, we want at most 1 hit each
+        "-outfmt",
+        '6 qseqid qlen sseqid length pident sstart send sframe',
+        "-word_size", "11"
+    ]
+    subprocess_util.run_subprocess(cmd)
+
+    return blast_output_file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ecoli_serotyping/commandLineOptions.py	Fri Jan 05 15:43:14 2018 -0500
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+
+import argparse
+
+
+def parse_command_line(args=None):
+    """
+    The options for both the serotyper, and virulence finder.
+    The returned object is used by both, but the options do not
+    necessarily apply to both.
+
+    Args:
+        args: Optional args to be passed to argparse.parse_args()
+
+    Returns:
+        The populated argparse Namespace
+    """
+
+    def check_percentage(value):
+        """
+        type checker for percentage input
+        """
+        ivalue = int(value)
+        if ivalue <= 0 or ivalue > 100:
+            raise argparse.ArgumentTypeError(
+                "{0} is an invalid positive int percentage value".format(value)
+            )
+        return ivalue
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-i",
+        "--input",
+        help="Location of new file(s). Can be a single file or \
+            a directory",
+        required=True
+    )
+
+    parser.add_argument(
+        "-d",
+        "--percentIdentity",
+        type=check_percentage,
+        help="Percentage of identity wanted to use against the\
+                  database. From 0 to 100, default is 90%%.",
+        default=90
+    )
+
+    parser.add_argument(
+        "-l",
+        "--percentLength",
+        type=check_percentage,
+        help="Percentage of length wanted to use against the \
+                  database. From 0 to 100, default is 50%%.",
+        default=50
+    )
+
+    parser.add_argument(
+        "--verify",
+        action="store_true",
+        help="Enable E. Coli. verification"
+    )
+
+    parser.add_argument(
+        "-s",
+        "--species",
+        action="store_true",
+        help="Enable species identification when non-ecoli genome is found\n\
+            Note: refseq downloading is required when running this option for the first time."
+    )
+
+    parser.add_argument(
+        '--detailed',
+        action='store_true',
+        help='Enable detailed output'
+    )
+
+    parser.add_argument(
+        "-o",
+        "--output",
+        help="Directory location of output files."
+    )
+
+    if args is None:
+        return parser.parse_args()
+    return parser.parse_args(args)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ecoli_serotyping/definitions.py	Fri Jan 05 15:43:14 2018 -0500
@@ -0,0 +1,20 @@
+#!/usr/bin/env python
+
+"""
+    Definitions for the ectyper project
+"""
+
+import os
+
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+DATA_DIR = os.path.join(ROOT_DIR, 'Data')
+WORKPLACE_DIR = os.getcwd()
+
+SEROTYPE_FILE = os.path.join(DATA_DIR, 'ectyper_data.fasta')
+SEROTYPE_ALLELE_JSON = os.path.join(DATA_DIR, 'ectyper_dict.json')
+COMBINED = os.path.join(DATA_DIR, 'combined.fasta')
+
+ECOLI_MARKERS = os.path.join(DATA_DIR, 'ecoli_specific_markers.fasta')
+SAMTOOLS = 'samtools'
+REFSEQ_SUMMARY = os.path.join(DATA_DIR, 'assembly_summary_refseq.txt')
+REFSEQ_SKETCH = os.path.join(DATA_DIR, 'refseq.genomes.k21s1000.msh')
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ecoli_serotyping/ectyper.py	Fri Jan 05 15:43:14 2018 -0500
@@ -0,0 +1,303 @@
+#!/usr/bin/env python
+"""
+    Predictive serotyping for _E. coli_.
+"""
+import logging
+import os
+import sys
+import tempfile
+import datetime
+from urllib.request import urlretrieve
+
+from ectyper import (blastFunctions, commandLineOptions, definitions,
+                     genomeFunctions, loggingFunctions, predictionFunctions,
+                     speciesIdentification)
+
+LOG_FILE = loggingFunctions.initialize_logging()
+LOG = logging.getLogger(__name__)
+
+
+def run_program():
+    """
+    Wrapper for both the serotyping and virulence finder
+    The program needs to do the following
+    (1) Filter genome files based on format
+    (2) Filter genome files based on species
+    (3) Map FASTQ files to FASTA seq
+    (1) Get names of all genomes being tested
+    (2) Create a BLAST database of those genomes
+    (3) Query for serotype
+    (4) Parse the results
+    (5) Display the results
+    :return: success or failure
+
+    """
+    # Initialize the program
+    args = commandLineOptions.parse_command_line()
+    LOG.info('Starting ectyper\nSerotype prediction with input:\n \
+            {0}\n \
+            Log file is: {1}'.format(args, LOG_FILE))
+    LOG.debug(args)
+
+    ## Initialize temporary directories for the scope of this program
+    with tempfile.TemporaryDirectory() as temp_dir:
+        temp_files = create_tmp_files(temp_dir, output_dir=args.output)
+        LOG.debug(temp_files)
+
+        # Download refseq file if speicies identification is enabled
+        if args.species:
+            download_refseq()
+
+        LOG.info("Gathering genome files")
+        raw_genome_files = genomeFunctions.get_files_as_list(args.input)
+        LOG.debug(raw_genome_files)
+
+        LOG.info("Removing invalid file types")
+        raw_files_dict = get_raw_files(raw_genome_files)
+        LOG.debug(raw_files_dict)
+
+        # Assembling fastq + verify ecoli genome
+        LOG.info("Preparing genome files for blast alignment")
+        final_fasta_files = filter_for_ecoli_files(
+            raw_files_dict, temp_files, verify=args.verify, species=args.species
+        )
+        LOG.debug(final_fasta_files)
+        if len(final_fasta_files) is 0:
+            LOG.info("No valid genome files. Terminating the program.")
+            exit(0)
+
+        LOG.info("Standardizing the genome headers")
+        (all_genomes_list, all_genomes_files) = \
+            genomeFunctions.get_genome_names_from_files(
+                final_fasta_files, temp_files['fasta_temp_dir'])
+        LOG.debug(all_genomes_list)
+        LOG.debug(all_genomes_files)
+
+        # Main prediction function
+        predictions_file = run_prediction(all_genomes_files, args,
+                                          temp_files['output_file'])
+
+        # Add empty rows for genomes without blast result
+        predictions_file = predictionFunctions.add_non_predicted(
+            all_genomes_list, predictions_file)
+        LOG.info('Outputs stored in {0}'.format(temp_files['output_dir']))
+
+        # Store most recent result in working directory
+        LOG.info('\nReporting result...')
+        predictionFunctions.report_result(predictions_file)
+
+def download_refseq():
+    '''Download refseq file with progress bar
+    '''
+
+
+    def reporthook(blocknum, blocksize, totalsize):
+        '''
+        https://stackoverflow.com/questions/15644964/python-progress-bar-and-downloads
+        '''
+        readsofar = blocknum * blocksize
+        if totalsize > 0:
+            s = "\r {:5.1%} {:{}d} / {:d}".format(
+                readsofar/totalsize, readsofar,
+                len(str(totalsize)),
+                totalsize
+            )
+            sys.stderr.write(s)
+            if readsofar >= totalsize: # near the end
+                sys.stderr.write("\n")
+        else: # total size is unknown
+            sys.stderr.write("read {}\n".format(readsofar))
+
+    if not os.path.isfile(definitions.REFSEQ_SKETCH):
+        refseq_url = 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh'
+        LOG.info("No refseq found. Downloading reference file for species identification...")
+        urlretrieve(refseq_url, definitions.REFSEQ_SKETCH, reporthook)
+        LOG.info("Download complete.")
+
+def create_tmp_files(temp_dir, output_dir=None):
+    """Create a dictionary of temporary files used by ectyper
+
+    Args:
+        temp_dir: program scope temporary directory
+        output_dir(str, optional):
+            directory to store output
+
+    Return:
+        a dictionary of temporary files
+        example:
+            {'assemble_temp_dir': 'test/temp/assemblies',
+             'fasta_temp_dir': 'test/temp/fastas',
+             'output_dir': os.path.abspath('output')+'/',
+             'output_file': os.path.abspath('output/output.csv')}
+    """
+
+    # Get the correct files and directories
+    files_and_dirs = {
+        'assemble_temp_dir': os.path.join(temp_dir, 'assemblies'),
+        'fasta_temp_dir': os.path.join(temp_dir, 'fastas'),
+    }
+    if output_dir is None:
+        output_dir = ''.join([
+            str(datetime.datetime.now().date()),
+            '_',
+            str(datetime.datetime.now().time()).replace(':', '.')
+        ])
+    if os.path.isabs(output_dir):
+        pass
+    else:
+        output_dir = os.path.join(definitions.WORKPLACE_DIR, 'output', output_dir)
+
+    output_file = os.path.join(output_dir, 'output.csv')
+    if os.path.isfile(output_file):
+        os.remove(output_file)
+    for d in [
+            output_dir, files_and_dirs['assemble_temp_dir'],
+            files_and_dirs['fasta_temp_dir']
+    ]:
+        if not os.path.exists(d):
+            os.makedirs(d)
+
+    # Finalize the tmp_files dictionary
+    files_and_dirs['output_file'] = output_file
+    files_and_dirs['output_dir'] = output_dir
+
+    LOG.info("Temporary files and directory created")
+    return files_and_dirs
+
+
+def run_prediction(genome_files, args, predictions_file):
+    '''Core prediction functionality
+    
+    Args:
+        genome_files:
+            list of genome files
+        args:
+            commandline arguments
+        predictions_file:
+            filename of prediction output
+    
+    Returns:
+        predictions_file with prediction written in it
+    '''
+    query_file = definitions.SEROTYPE_FILE
+    ectyper_dict_file = definitions.SEROTYPE_ALLELE_JSON
+    # create a temp dir for blastdb
+    with tempfile.TemporaryDirectory() as temp_dir:
+        # Divide genome files into chunks
+        chunk_size = 50
+        genome_chunks = [
+            genome_files[i:i + chunk_size]
+            for i in range(0, len(genome_files), chunk_size)
+        ]
+        for index, chunk in enumerate(genome_chunks):
+            LOG.info("Start creating blast database #{0}".format(index + 1))
+            blast_db = blastFunctions.create_blast_db(chunk, temp_dir)
+
+            LOG.info("Start blast alignment on database #{0}".format(index + 1))
+            blast_output_file = blastFunctions.run_blast(query_file, blast_db, args)
+            LOG.info("Start serotype prediction for database #{0}".format(index + 1))
+            predictions_file = predictionFunctions.predict_serotype(
+                blast_output_file, ectyper_dict_file, predictions_file,
+                args.detailed)
+        return predictions_file
+
+
+def get_raw_files(raw_files):
+    """Take all the raw files, and filter not fasta / fastq
+    
+    Args:
+        raw_files(str): list of files from user input
+    
+    Returns:
+        A dictitionary collection of fasta and fastq files
+        example:
+        {'raw_fasta_files':[],
+         'raw_fastq_files':[]}
+    """
+    fasta_files = []
+    fastq_files = []
+
+    for file in raw_files:
+        file_format = genomeFunctions.get_valid_format(file)
+        if file_format == 'fasta':
+            fasta_files.append(file)
+        elif file_format == 'fastq':
+            fastq_files.append(file)
+
+    LOG.debug('raw fasta files: {}'.format(fasta_files))
+    LOG.debug('raw fastq files: {}'.format(fastq_files))
+
+    return({'fasta':fasta_files, 'fastq':fastq_files})
+
+
+def filter_for_ecoli_files(raw_dict, temp_files, verify=False, species=False):
+    """filter ecoli, identify species, assemble fastq
+    Assemble fastq files to fasta files,
+    then filter all files by reference method if verify is enabled,
+    if identified as non-ecoli, identify species by mash method if species is enabled.
+    
+    Args:
+        raw_dict{fasta:list_of_files, fastq:list_of_files}:
+            dictionary collection of fasta and fastq files
+        temp_file: temporary directory
+        verify(bool):
+            whether to perform ecoli verification
+        species(bool):
+            whether to perform species identification for non-ecoli genome
+    Returns:
+        List of filtered and assembled genome files in fasta format
+    """
+    final_files = []
+    for f in raw_dict.keys():
+        temp_dir = temp_files['fasta_temp_dir'] if f == "fasta" else temp_files['assemble_temp_dir']
+
+        for ffile in raw_dict[f]:
+            filtered_file = filter_file_by_species(
+                ffile, f, temp_dir, verify=verify, species=species)
+            if filtered_file is not None and \
+            genomeFunctions.get_valid_format(filtered_file) is not None:
+                final_files.append(filtered_file)
+
+    LOG.info('{} final fasta files'.format(len(final_files)))
+    return final_files
+
+def filter_file_by_species(genome_file, genome_format, temp_dir, verify=False, species=False):
+    """filter ecoli, identify species, assemble fastq
+    Assemble fastq file to fasta file,
+    then filter the file by reference method if verify is enabled,
+    if identified as non-ecoli, identify species by mash method if species is enabled.
+    
+    Args:
+        genome_file: input genome file
+        genome_format(str): fasta or fastq
+        temp_file: temporary directory
+        verify(bool):
+            whether to perform ecoli verification
+        species(bool):
+            whether to perform species identification for non-ecoli genome
+    Returns:
+        The filtered and assembled genome files in fasta format
+    """
+    combined_file = definitions.COMBINED
+    filtered_file = None
+    if genome_format == 'fastq':
+        iden_file, pred_file = \
+            genomeFunctions.assemble_reads(genome_file, combined_file, temp_dir)
+        # If no alignment resut, the file is definitely not E.Coli
+        if genomeFunctions.get_valid_format(iden_file) is None:
+            LOG.warning(
+                "{} is filtered out because no identification alignment found".format(genome_file))
+            return filtered_file
+        if not (verify or species) or speciesIdentification.is_ecoli_genome(
+                iden_file, genome_file, mash=species):
+            # final check before adding the alignment for prediction
+            if genomeFunctions.get_valid_format(iden_file) != 'fasta':
+                LOG.warning(
+                    "{0} is filtered out because no prediction alignment found".format(genome_file))
+                return filtered_file
+            filtered_file = pred_file
+    if genome_format == 'fasta':
+        if not (verify or species) \
+        or speciesIdentification.is_ecoli_genome(genome_file, mash=species):
+            filtered_file = genome_file
+    return filtered_file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ecoli_serotyping/genomeFunctions.py	Fri Jan 05 15:43:14 2018 -0500
@@ -0,0 +1,280 @@
+'''
+Genome Utilities
+'''
+#!/usr/bin/env python
+
+import logging
+import os
+import re
+import tempfile
+from tarfile import is_tarfile
+
+from Bio import SeqIO
+from ectyper import definitions, subprocess_util
+
+LOG = logging.getLogger(__name__)
+
+
+def get_files_as_list(file_or_directory):
+    """
+    Creates a list of files from either the given file, or all files within the
+    directory specified (where each file name is its absolute path).
+
+    Args:
+        file_or_directory (str): file or directory name given on commandline
+
+    Returns:
+        files_list (list(str)): List of all the files found.
+
+    """
+
+    files_list = []
+    if file_or_directory == '':
+        return files_list
+
+    if os.path.isdir(file_or_directory):
+        LOG.info("Gathering genomes from directory " + file_or_directory)
+
+        # Create a list containing the file names
+        for root, dirs, files in os.walk(file_or_directory):
+            for filename in files:
+                files_list.append(os.path.join(root, filename))
+    # check if input is concatenated file locations
+    elif ',' in file_or_directory:
+        LOG.info("Using genomes in the input list")
+        for filename in file_or_directory.split(','):
+            files_list.append(os.path.abspath(filename))
+    else:
+        LOG.info("Using genomes in file " + file_or_directory)
+        files_list.append(os.path.abspath(file_or_directory))
+
+    return sorted(files_list)
+
+
+def get_valid_format(file):
+    """
+    Check using SeqIO if files are valid fasta/fastq format, returns the format.
+
+    Args:
+        file (str): path of file
+
+    Returns:
+        fm (str or None): the file format if 'fasta' or 'fastq', otherwise None
+    """
+    for fm in ['fastq', 'fasta']:
+        try:
+            with open(file, "r") as handle:
+                data = SeqIO.parse(handle, fm)
+                if any(data):
+                    if is_tarfile(file):
+                        LOG.warning("Compressed file is not supported: {}".format(file))
+                        return None
+                    return fm
+        except FileNotFoundError as err:
+            LOG.warning("{0} is not found".format(file))
+            return None
+        except UnicodeDecodeError as err:
+            LOG.warning("{0} is not a valid file".format(file))
+            return None
+        except:
+            LOG.warning("{0} is an unexpected file".format(file))
+            return None
+    LOG.warning("{0} is not a fasta/fastq file".format(file))
+    return None
+
+
+def get_genome_names_from_files(files, temp_dir):
+    """
+    For each file:
+    Takes the first header from a fasta file and sends it to the get_genome_name
+    function. Returns the name of the genome. If the name of the file is to be
+    used as the genome name, creates a temporary file using >lcl|filename as the
+    first part of the header.
+
+    Args:
+        files (list): The list of files to get the genome names for
+        tempdir (str): A tempdir where the copied files will be stored
+
+    Returns:
+        tuple(list(str), list(str)): first list is genome names, second list is file names
+    """
+
+    list_of_genomes = []
+    list_of_files = []
+    for file in files:
+        # Always to use the filename as the genome name.
+        # This means we also need to create a new file adding
+        # the filename to each of the headers, so that downstream applications
+        # (eg. BLAST) can be used with the filename as genome name.
+
+        # get only the name of the file for use in the fasta header
+        file_base_name = os.path.basename(file)
+        file_path_name = os.path.splitext(file_base_name)[0]
+        n_name = file_path_name.replace(' ', '_')
+
+        # create a new file for the updated fasta headers
+        new_file = tempfile.NamedTemporaryFile(dir=temp_dir, delete=False).name
+
+        # add the new name to the list of files and genomes
+        list_of_files.append(new_file)
+        list_of_genomes.append(n_name)
+
+        with open(new_file, "w") as outfile:
+            with open(file) as infile:
+                for record in SeqIO.parse(infile, "fasta"):
+                    outfile.write(">lcl|" + n_name + "|" + record.description + "\n")
+                    outfile.write(str(record.seq) + "\n")
+
+    return list_of_genomes, list_of_files
+
+
+def get_genome_name(header):
+    """
+    Getting the name of the genome by hierarchy. This requires reading the first
+    fasta header from the file. It also assumes a single genome per file.
+
+    Args:
+        header (str): The header containing the record.
+
+    Returns:
+        genomeName (str): Name of the genome contained in the header.
+    """
+
+    re_patterns = (
+        # Look for lcl followed by the possible genome name
+        re.compile('(lcl\|[\w\-\.]+)'),
+
+        # Look for contigs in the wwwwdddddd format
+        re.compile('([A-Za-z]{4}\d{2})\d{6}'),
+
+        # Look for a possible genome name at the beginning of the record ID
+        re.compile('^(\w{8}\.\d)'),
+
+        # Look for ref, gb, emb or dbj followed by the possible genome name
+        re.compile('(ref\|\w{2}_\w{6}|gb\|\w{8}|emb\|\w{8}|dbj\|\w{8})'),
+
+        # Look for gi followed by the possible genome name
+        re.compile('(gi\|\d{8})'),
+
+
+        # Look for name followed by space, then description
+        re.compile('^([\w\-\.]+)\s+[\w\-\.]+')
+    )
+
+    # if nothing matches, use the full header as genome_name
+    genome_name = header
+    for rep in re_patterns:
+        m = rep.search(header)
+
+        if m:
+            genome_name = m.group(1)
+            break
+
+    return str(genome_name)
+
+def assemble_reads(reads, reference, temp_dir):
+    '''
+    Return path to assembled reads.
+    Assemble short reads by mapping to a reference genome.
+    Default output is the same as reads file
+        (basename+'iden.fasta' and basename+'pred.fasta').
+
+    Args:
+        reads (str): FASTQ/FQ format reads file
+        reference (str): FASTA format reference file
+        temp_dir (str): temp_dir for storing assembled files
+
+    Returns:
+        tuple(str, str): identifcation and prediction fasta file
+    '''
+    output = os.path.join(
+        temp_dir,
+        os.path.splitext(os.path.basename(reads))[0]+'.fasta'
+    )
+
+    # run once if index reference does not exist
+    index_path = \
+        os.path.join(
+            definitions.DATA_DIR,
+            'bowtie_index',
+            os.path.splitext(os.path.basename(reference))[0]
+        )
+    index_dir = os.path.split(index_path)[0]
+    if not os.path.isdir(index_dir):
+        LOG.info('Reference index does not exist. Creating new reference index at {}'.format(index_dir))
+        if not os.path.exists(index_dir):
+            os.makedirs(index_dir)
+        cmd1 = [
+            'bowtie2-build',
+            reference,
+            index_path
+        ]
+        subprocess_util.run_subprocess(cmd1)
+
+    cmd2 = [
+        'bowtie2',
+        '--score-min L,1,-0.5',
+        '--np 5',
+        '-x', index_path,
+        '-U', reads,
+        '-S', os.path.join(temp_dir, 'reads.sam')
+    ]
+    subprocess_util.run_subprocess(cmd2)
+
+    cmd3 = [
+        definitions.SAMTOOLS, 'view',
+        '-F 4',
+        '-q 1',
+        '-bS', os.path.join(temp_dir, 'reads.sam'),
+        '-o', os.path.join(temp_dir, 'reads.bam')
+    ]
+    subprocess_util.run_subprocess(cmd3)
+    cmd4 = [
+        definitions.SAMTOOLS, 'sort',
+        os.path.join(temp_dir, 'reads.bam'),
+        '-o', os.path.join(temp_dir, 'reads.sorted.bam'),
+    ]
+    subprocess_util.run_subprocess(cmd4)
+
+    shell_cmd = [
+        definitions.SAMTOOLS+' mpileup -uf', # mpileup
+        reference,
+        os.path.join(temp_dir, 'reads.sorted.bam'),
+        '|',
+        'bcftools call -c', # variant calling
+        '|',
+        'vcfutils.pl vcf2fq', # vcf to fq
+        '|',
+        'seqtk seq -A -', # fq to fasta
+        '>',
+        output
+    ]
+    subprocess_util.run_subprocess(' '.join(shell_cmd), is_shell=True)
+    return split_mapped_output(output)
+
+def split_mapped_output(file):
+    '''
+    Splits given fasta files into two file based on 'lcl' tags
+        in the seq header
+
+    Args:
+        file (str): path to input fasta file
+
+    Returns:
+        (str): path to ecoli identification fasta seq
+        (str): path to serotype prediction fasta seq
+    '''
+    identif_file = os.path.splitext(file)[0]+'.iden.fasta'
+    predict_file = os.path.splitext(file)[0]+'.pred.fasta'
+    identif_seqs = []
+    predict_seqs = []
+    for record in SeqIO.parse(file, 'fasta'):
+        if 'lcl' in record.description:
+            identif_seqs.append(record)
+        else:
+            predict_seqs.append(record)
+    with open(identif_file, "w") as output_handle:
+        SeqIO.write(identif_seqs, output_handle, "fasta")
+    with open(predict_file, "w") as output_handle:
+        SeqIO.write(predict_seqs, output_handle, "fasta")
+    return identif_file, predict_file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ecoli_serotyping/loggingFunctions.py	Fri Jan 05 15:43:14 2018 -0500
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+"""
+    Set up the logging
+"""
+
+import logging
+import os
+
+import ectyper.definitions as D
+
+
+def initialize_logging():
+    """
+    Set up the screen and file logging.
+
+    Args:
+        None
+        
+    Returns:
+        log_file (str): The log filename
+    """
+
+    # set up DEBUG logging to file, INFO logging to STDERR
+    log_file = os.path.join(D.WORKPLACE_DIR, 'ectyper.log')
+
+    formatter = logging.Formatter(
+        '%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
+
+    # logging to file
+    logging.basicConfig(
+        level=logging.DEBUG,
+        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
+        datefmt='%m-%d %H:%M',
+        filename=log_file,
+        filemode='w')
+
+    # define a Handler which writes INFO messages or higher to the sys.stderr
+    console = logging.StreamHandler()
+    console.setFormatter(formatter)
+    console.setLevel(logging.INFO)
+
+    # add the handler to the root logger
+    logging.getLogger('').addHandler(console)
+
+    return log_file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ecoli_serotyping/predictionFunctions.py	Fri Jan 05 15:43:14 2018 -0500
@@ -0,0 +1,231 @@
+#!/usr/bin/env python
+
+import json
+import logging
+import os
+from collections import defaultdict
+
+import pandas as pd
+
+LOG = logging.getLogger(__name__)
+
+"""
+    Serotype prediction for E. coli
+"""
+
+def predict_serotype(blast_output_file, ectyper_dict_file, predictions_file, detailed=False):
+    """Make serotype prediction for all genomes based on blast output
+    
+    Args:
+        blast_output_file(str):
+            blastn output with outfmt:
+                "6 qseqid qlen sseqid length pident sstart send sframe qcovhsp -word_size 11"
+    ectyper_dict_file(str):
+        mapping file used to associate allele id to allele informations
+    predictions_file(str):
+        csv file to store result
+    detailed(bool, optional):
+        whether to generate detailed output or not
+    
+    Returns:
+        predictions_file
+    """
+    basename, extension = os.path.splitext(predictions_file)
+    parsed_output_file = ''.join([basename, '_raw', extension])
+
+    LOG.info("Predicting serotype from blast output")
+    output_df = blast_output_to_df(blast_output_file)
+    ectyper_df = ectyper_dict_to_df(ectyper_dict_file)
+    # Merge output_df and ectyper_df
+    output_df = output_df.merge(ectyper_df, left_on='qseqid', right_on='name', how='left')
+    predictions_dict = {}
+    # Select individual genomes
+    output_df['genome_name'] = output_df['sseqid'].str.split('|').str[1]
+    # Initialize constants
+    gene_pairs = {'wzx':'wzy', 'wzy':'wzx', 'wzm':'wzt', 'wzt':'wzm'}
+    predictions_columns = ['O_prediction', 'O_info', 'H_prediction', 'H_info']
+    gene_list = ['wzx', 'wzy', 'wzm', 'wzt', 'fliC', 'fllA', 'flkA', 'flmA', 'flnA']
+    if detailed:
+        # Add gene lists for detailed output report
+        for gene in gene_list:
+            predictions_columns.append(gene)
+    for genome_name, per_genome_df in output_df.groupby('genome_name'):
+        # Make prediction for each genome based on blast output
+        predictions_dict[genome_name] = get_prediction(
+            per_genome_df, predictions_columns, gene_pairs, detailed)
+    predictions_df = pd.DataFrame(predictions_dict).transpose()
+    if predictions_df.empty:
+        predictions_df = pd.DataFrame(columns=predictions_columns)
+    predictions_df = predictions_df[predictions_columns]
+    store_df(output_df, parsed_output_file)
+    store_df(predictions_df, predictions_file)
+    LOG.info("Serotype prediction completed")
+    return predictions_file
+
+def get_prediction(per_genome_df, predictions_columns, gene_pairs, detailed, ):
+    """Make serotype prediction for single genomes based on blast output
+    
+    Args:
+        per_genome_df(DataFrame):
+            blastn outputs for the given genome
+        predictions_columns(dict):
+            columns to be filled by prediction function
+        gene_pairs(dict):
+            dict of pair genes used for paired logic
+        detailed(bool):
+            whether to generate detailed output or not
+    Return:
+        Prediction dictionary for the given genome
+    """
+    # Extract potential predictors
+    useful_columns = [
+        'gene', 'serotype', 'score', 'name', 'desc', 'pident', 'qcovhsp', 'qseqid', 'sseqid'
+    ]
+    per_genome_df = per_genome_df.sort_values(['gene', 'serotype', 'score'], ascending=False)
+    per_genome_df = per_genome_df[~per_genome_df.duplicated(['gene', 'serotype'])]
+    predictors_df = per_genome_df[useful_columns]
+    predictors_df = predictors_df.sort_values('score', ascending=False)
+    predictions = {}
+    for column in predictions_columns:
+        predictions[column] = None
+    for predicting_antigen in ['O', 'H']:
+        genes_pool = defaultdict(list)
+        for index, row in predictors_df.iterrows():
+            gene = row['gene']
+            if detailed:
+                predictions[gene] = True
+            if not predictions[predicting_antigen+'_prediction']:
+                serotype = row['serotype']
+                if serotype[0] is not predicting_antigen:
+                    continue
+                genes_pool[gene].append(serotype)
+                prediction = None
+                if len(serotype) < 1:
+                    continue
+                antigen = serotype[0].upper()
+                if antigen != predicting_antigen:
+                    continue
+                if gene in gene_pairs.keys():
+                    predictions[antigen+'_info'] = 'Only unpaired alignments found'
+                    # Pair gene logic
+                    potential_pairs = genes_pool.get(gene_pairs.get(gene))
+                    if potential_pairs is None:
+                        continue
+                    if serotype in potential_pairs:
+                        prediction = serotype
+                else:
+                    # Normal logic
+                    prediction = serotype
+                if prediction is None:
+                    continue
+                predictions[antigen+'_info'] = 'Alignment found'
+                predictions[predicting_antigen+'_prediction'] = prediction
+        # No alignment found
+        ## Make prediction based on non-paired gene
+        ##   if only one non-paired gene is avaliable
+        if predictions.get(predicting_antigen+'_prediction') is None:
+            if len(genes_pool) == 1:
+                serotypes = list(genes_pool.values())[0]
+                if len(serotypes) == 1:
+                    predictions[antigen+'_info'] = 'Lone unpaired alignment found'
+                    predictions[predicting_antigen+'_prediction'] = serotypes[0]
+    return predictions
+
+def blast_output_to_df(blast_output_file):
+    '''Convert raw blast output file to DataFrame
+    Args:
+        blast_output_file(str): location of blast output
+    
+    Returns:
+        DataFrame:
+            DataFrame that contains all informations from blast output
+    '''
+    # Load blast output
+    output_data = []
+    with open(blast_output_file, 'r') as fh:
+        for line in fh:
+            fields = line.strip().split()
+            entry = {
+                'qseqid': fields[0],
+                'qlen': fields[1],
+                'sseqid': fields[2],
+                'length': fields[3],
+                'pident': fields[4],
+                'sstart': fields[5],
+                'send': fields[6],
+                'sframe': fields[7],
+                'qcovhsp': fields[8]
+            }
+            output_data.append(entry)
+    df = pd.DataFrame(output_data)
+    if not output_data:
+        LOG.info("No hit found for this blast query")
+        # Return empty dataframe with correct columns
+        return pd.DataFrame(
+            columns=[
+                'length', 'pident', 'qcovhsp',
+                'qlen', 'qseqid', 'send',
+                'sframe', 'sseqid', 'sstart'
+            ])
+    df['score'] = df['pident'].astype(float)*df['qcovhsp'].astype(float)/10000
+    return df
+
+def ectyper_dict_to_df(ectyper_dict_file):
+    # Read ectyper dict
+    with open(ectyper_dict_file) as fh:
+        ectyper_dict = json.load(fh)
+        temp_list = []
+        for antigen, alleles in ectyper_dict.items():
+            for name, allele in alleles.items():
+                new_entry = {
+                    'serotype': allele.get('allele'),
+                    'name': name,
+                    'gene': allele.get('gene'),
+                    'desc': allele.get('desc')
+                }
+                temp_list.append(new_entry)
+        df = pd.DataFrame(temp_list)
+        return df
+
+def store_df(src_df, dst_file):
+    """Append dataframe to a file if it exists, otherwise, make a new file
+    Args:
+        src_df(str): dataframe object to be stored
+        dst_file(str): dst_file to be modified/created
+    """
+    if os.path.isfile(dst_file):
+        with open(dst_file, 'a') as fh:
+            src_df.to_csv(fh, header=False)
+    else:
+        with open(dst_file, 'w') as fh:
+            src_df.to_csv(fh, header=True, index_label='genome')
+
+def report_result(csv_file):
+    '''Report the content of dataframe in log
+    
+    Args:
+        csv_file(str): location of the prediction file
+    '''
+    df = pd.read_csv(csv_file)
+    if df.empty:
+        LOG.info('No prediction was made becuase no alignment was found')
+        return
+    LOG.info('\n{0}'.format(df.to_string(index=False)))
+
+def add_non_predicted(all_genomes_list, predictions_file):
+    '''Add genomes that do not show up in blast result to prediction file
+    
+    Args:
+        all_genome_list(list):
+            list of genomes from user input
+        predictions_file(str):
+            location of the prediction file
+    Returns:
+        str: location of the prediction file
+    '''
+    df = pd.read_csv(predictions_file)
+    df = df.merge(pd.DataFrame(all_genomes_list, columns=['genome']), on='genome', how='outer')
+    df.fillna({'O_info':'No alignment found', 'H_info':'No alignment found'}, inplace=True)
+    df.fillna('-', inplace=True)
+    df.to_csv(predictions_file, index=False)
+    return predictions_file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ecoli_serotyping/speciesIdentification.py	Fri Jan 05 15:43:14 2018 -0500
@@ -0,0 +1,152 @@
+import logging
+import multiprocessing
+import os
+import tempfile
+from Bio import SeqIO
+
+from ectyper import genomeFunctions, blastFunctions, definitions, subprocess_util
+
+LOG = logging.getLogger(__name__)
+
+def is_ecoli_genome(iden_file, genome_file=None, mash=False):
+    '''
+    Return True if file is classified as ecoli by ecoli markers, otherwise False
+
+    Args:
+        iden_file (str): path to valid fasta genome file
+        genome_file (str): Optional path to valid fastq file for reads
+        mash (bool): Optional input to decide whether to use mash if genome is
+                     identified as non-ecoli
+
+    Returns:
+        bool: True if iden_file is ecoli, False otherwise
+    '''
+    if genome_file is None:
+        genome_file = iden_file
+    num_hit = get_num_hits(iden_file)
+    if num_hit < 3:
+        LOG.info(
+            "{0} is identified as "
+            "an invalid e.coli genome file "
+            "by marker approach".format(os.path.basename(iden_file)))
+        if mash:
+            species = get_species(genome_file)
+            LOG.info(
+                "{0} is identified as genome of "
+                "{1} by mash approach".format(os.path.basename(iden_file), species))
+        return False
+    LOG.debug("{0} is a valid e.coli genome file".format(os.path.basename(iden_file)))
+    return True
+
+def get_num_hits(target):
+    '''
+    Return number of matching hits when query the reference genome
+        on the target genome
+
+    Args:
+        target (str): target genome
+
+    Returns:
+        int: number of hits found
+    '''
+    num_hit = 0
+    try:
+        with tempfile.TemporaryDirectory() as temp_dir:
+            blast_db = blastFunctions.create_blast_db([target], temp_dir)
+            result = blastFunctions.run_blast_for_identification(
+                definitions.ECOLI_MARKERS,
+                blast_db
+            )
+            with open(result) as handler:
+                LOG.debug("get_num_hits() output:")
+                for line in handler:
+                    LOG.debug(line)
+                    num_hit += 1
+            LOG.debug("{0} aligned to {1} marker sequences".format(target, num_hit))
+    except SystemExit:
+        pass
+    return num_hit
+
+def get_species(file):
+    '''
+    Given a fasta/fastq file, return the most likely species identification
+
+    Args:
+        file (str): fasta/fastq file input
+
+    Returns:
+        str: name of estimated species
+    '''
+    LOG.info("Identifying species for {0}".format(file))
+    if not os.path.isfile(definitions.REFSEQ_SKETCH):
+        LOG.warning("No refseq found.")
+        return None
+    species = 'unknown'
+    if genomeFunctions.get_valid_format(file) == 'fasta':
+        tmp_file = tempfile.NamedTemporaryFile().name
+        basename = os.path.basename(file).replace(' ', '_')
+        with open(tmp_file, 'w') as new_fh:
+            header = '> {0}\n'.format(basename)
+            new_fh.write(header)
+            for record in SeqIO.parse(file, 'fasta'):
+                new_fh.write(str(record.seq))
+                new_fh.write('nnnnnnnnnnnnnnnnnnnn')
+        try:
+            species = get_species_helper(tmp_file)
+        except Exception:
+            pass
+    if genomeFunctions.get_valid_format(file) == 'fastq':
+        species = get_species_helper(file)
+    return species
+
+def get_species_helper(file):
+    '''
+    Given a fasta/fastq file with one sequence, return the most likely species
+    identification
+
+    Args:
+        file (str): fasta/fastq file input
+
+    Returns:
+        str: name of estimated species
+    '''
+    species = 'unknown'
+    cmd = [
+        'mash', 'dist',
+        file,
+        definitions.REFSEQ_SKETCH,
+        '|',
+        'sort -gk3 -',
+        '|',
+        'head -1 -'
+    ]
+    try:
+        mash_output = subprocess_util.run_subprocess(' '.join(cmd), is_shell=True)
+        ass_acc_num = '_'.join(mash_output.split('\t')[1].split('_')[:2])
+        cmd = [
+            'grep -E',
+            ass_acc_num,
+            definitions.REFSEQ_SUMMARY
+        ]
+        summary_output = subprocess_util.run_subprocess(' '.join(cmd), is_shell=True)
+        species = summary_output.split('\t')[7]
+        return species
+    except Exception as err:
+        LOG.warning('No matching species found with distance estimation:{0}'.format(err))
+        try:
+            cmd = [
+                'mash screen',
+                '-w',
+                '-p', str(multiprocessing.cpu_count()//2),
+                definitions.REFSEQ_SKETCH,
+                file,
+                '| sort -gr - | head -1 -'
+            ]
+            screen_output = subprocess_util.run_subprocess(' '.join(cmd), is_shell=True)
+            LOG.debug(screen_output.split('\t'))
+            species = screen_output.split('\t')[5].split('\n')[0]
+        except Exception as err2:
+            LOG.warning(
+                'No matching species found with distance screening either:{}'.format(err2)
+            )
+    return species
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ecoli_serotyping/subprocess_util.py	Fri Jan 05 15:43:14 2018 -0500
@@ -0,0 +1,34 @@
+'''
+Utilities
+'''
+import logging
+import subprocess
+import timeit
+
+LOG = logging.getLogger(__name__)
+
+def run_subprocess(cmd, is_shell=False):
+    '''
+    Run cmd command on subprocess
+
+    Args:
+        cmd (str): cmd command
+
+    Returns:
+        stdout (str): The stdout of cmd
+    '''
+    start_time = timeit.default_timer()
+    LOG.debug("Running: {0}".format(cmd))
+    comp_proc = subprocess.run(
+        cmd,
+        shell=is_shell,
+        universal_newlines=True,
+        check=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE
+    )
+    stderr = comp_proc.stderr
+    stdout = comp_proc.stdout
+    elapsed_time = timeit.default_timer() - start_time
+    LOG.debug("Subprocess finished successfully in {:0.3f} sec.".format(elapsed_time))
+    return stdout