Mercurial > repos > pjbriggs > pal_finder

diff pal_filter.py @ 4:cb56cc1d5c39 draft
Updates to the palfilter.py utility.
author: pjbriggs
date: Mon, 21 Mar 2016 06:52:43 -0400
parents: e1a14ed7a9d6
children: 8159dab5dbdb
--- a/pal_filter.py	Wed Feb 24 08:25:17 2016 -0500
+++ b/pal_filter.py	Mon Mar 21 06:52:43 2016 -0400
@@ -1,87 +1,140 @@
 #!/usr/bin/python -tt
-#
-# pal_filter
-# https://github.com/graemefox/pal_filter
-#
-################################################################################
-# Graeme Fox - 15/02/2016 - graeme.fox@manchester.ac.uk
-# Tested on 64-bit Ubuntu, with Python 2.7
-#
-################################################################################
-# PROGRAM DESCRIPTION
-#
-# Program to pick optimum loci from the output of pal_finder_v0.02.04
-#
-# This program can be used to filter output from pal_finder and choose the
-# 'optimum' loci.
-#
-# For the paper referncing this workflow, see Griffiths et al.
-# (unpublished as of 15/02/2016) (sarah.griffiths-5@postgrad.manchester.ac.uk)
-#
-################################################################################
-#
-# This program also contains a quality-check method to improve the rate of PCR
-# success. For this QC method, paired end reads are assembled using
-# PANDAseq so you must have PANDAseq installed.
-#
-# For the paper referencing this assembly-QC method see Fox et al.
-# (unpublished as of 15/02/2016) (graeme.fox@manchester.ac.uk)
-#
-# For best results in PCR for marker development, I suggest enabling all the
-# filter options AND the assembly based QC
-#
-################################################################################
-# REQUIREMENTS
-#
-# Must have Biopython installed (www.biopython.org).
-#
-# If you with to perform the assembly QC step, you must have:
-# PandaSeq (https://github.com/neufeld/pandaseq)
-# PandaSeq must be in your $PATH / able to run from anywhere
-################################################################################
-# REQUIRED OPTIONS
-#
-# -i forward_paired_ends.fastQ
-# -j reverse_paired_ends.fastQ
-# -p pal_finder output - the "(microsatellites with read IDs and
-# primer pairs)" file
-#
-# By default the program does nothing....
-#
-# NON-REQUIRED OPTIONS
-#
-# -assembly:  turn on the pandaseq assembly QC step
-# -primers:  filter microsatellite loci to just those which
-# have primers designed
-#
-# -occurrences:  filter microsatellite loci to those with primers
-# which appear only once in the dataset
-#
-# -rankmotifs:  filter microsatellite loci to just those with perfect motifs.
-# Rank the output by size of motif (largest first)
-#
-###########################################################
-# For repeat analysis, the following extra non-required options may be useful:
-#
-# Since PandaSeq Assembly, and fastq -> fasta conversion are slow, do them the
-# first time, generate the files and then skip either, or both steps with
-# the following:
-#
-# -a:  skip assembly step
-# -c:  skip fastq -> fasta conversion step
-#
-# Just make sure to keep the assembled/converted files in the correct directory
-# with the correct filename(s)
-###########################################################
-#
-# EXAMPLE USAGE:
-#
-# pal_filtery.py -i R1.fastq -j R2.fastq
-# -p pal_finder_output.tabular -primers -occurrences -rankmotifs -assembly
-#
-###########################################################
+"""
+pal_filter
+https://github.com/graemefox/pal_filter
+
+Graeme Fox - 03/03/2016 - graeme.fox@manchester.ac.uk
+Tested on 64-bit Ubuntu, with Python 2.7
+
+~~~~~~~~~~~~~~~~~~~
+PROGRAM DESCRIPTION
+
+Program to pick optimum loci from the output of pal_finder_v0.02.04
+
+This program can be used to filter output from pal_finder and choose the
+'optimum' loci.
+
+For the paper referncing this workflow, see Griffiths et al.
+(unpublished as of 15/02/2016) (sarah.griffiths-5@postgrad.manchester.ac.uk)
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+This program also contains a quality-check method to improve the rate of PCR
+success. For this QC method, paired end reads are assembled using
+PANDAseq so you must have PANDAseq installed.
+
+For the paper referencing this assembly-QC method see Fox et al.
+(unpublished as of 15/02/2016) (graeme.fox@manchester.ac.uk)
+
+For best results in PCR for marker development, I suggest enabling all the
+filter options AND the assembly based QC
+
+~~~~~~~~~~~~
+REQUIREMENTS
+
+Must have Biopython installed (www.biopython.org).
+
+If you with to perform the assembly QC step, you must have:
+PandaSeq (https://github.com/neufeld/pandaseq)
+PandaSeq must be in your $PATH / able to run from anywhere
+
+~~~~~~~~~~~~~~~~
+REQUIRED OPTIONS
+
+-i forward_paired_ends.fastQ
+-j reverse_paired_ends.fastQ
+-p pal_finder output - by default pal_finder names this "x_PAL_summary.txt"
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+BY DEFAULT THIS PROGRAM DOES NOTHING. ENABLE SOME OF THE OPTIONS BELOW.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+NON-REQUIRED OPTIONS
+
+-assembly:  turn on the pandaseq assembly QC step
+
+-primers:  filter microsatellite loci to just those which have primers designed
+
+-occurrences:  filter microsatellite loci to those with primers
+ which appear only once in the dataset
+
+-rankmotifs:  filter microsatellite loci to just those with perfect motifs.
+ Rank the output by size of motif (largest first)
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+For repeat analysis, the following extra non-required options may be useful:
+
+Since PandaSeq Assembly, and fastq -> fasta conversion are slow, do them the
+first time, generate the files and then skip either, or both steps with
+the following:
+
+-a:  skip assembly step
+-c:  skip fastq -> fasta conversion step
+
+Just make sure to keep the assembled/converted files in the correct directory
+with the correct filename(s)
+
+~~~~~~~~~~~~~~~
+EXAMPLE USAGE:
+
+pal_filtery.py -i R1.fastq -j R2.fastq
+ -p pal_finder_output.tabular -primers -occurrences -rankmotifs -assembly
+
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+"""
 import Bio, subprocess, argparse, csv, os, re, time
 from Bio import SeqIO
+__version__ = "1.0.0"
+############################################################
+# Function List                                            #
+############################################################
+def ReverseComplement1(seq):
+    """
+    take a nucleotide sequence and reverse-complement it
+    """
+    seq_dict = {'A':'T','T':'A','G':'C','C':'G'}
+    return "".join([seq_dict[base] for base in reversed(seq)])
+
+def fastq_to_fasta(input_file, wanted_set):
+    """
+    take a file in fastq format, convert to fasta format and filter on
+    the set of sequences that we want to keep
+    """
+    file_name = os.path.splitext(os.path.basename(input_file))[0]
+    with open(file_name + "_filtered.fasta", "w") as out:
+        for record in SeqIO.parse(input_file, "fastq"):
+            ID = str(record.id)
+            SEQ = str(record.seq)
+            if ID in wanted_set:
+                out.write(">" + ID + "\n" + SEQ + "\n")
+
+def strip_barcodes(input_file, wanted_set):
+    """
+    take fastq data containing sequencing barcodes and strip the barcode
+    from each sequence. Filter on the set of sequences that we want to keep
+    """
+    file_name = os.path.splitext(os.path.basename(input_file))[0]
+    with open(file_name + "_adapters_removed.fasta", "w") as out:
+        for record in SeqIO.parse(input_file, "fasta"):
+            match = re.search(r'\S*:', record.id)
+            if match:
+                correct = match.group().rstrip(":")
+            else:
+                correct = str(record.id)
+            SEQ = str(record.seq)
+            if correct in wanted_set:
+                out.write(">" + correct + "\n" + SEQ + "\n")
+
+############################################################
+# MAIN PROGRAM                                             #
+############################################################
+print "\n~~~~~~~~~~"
+print "pal_filter"
+print "~~~~~~~~~~"
+print "Version: " + __version__
+time.sleep(1)
+print "\nFind the optimum loci in your pal_finder output and increase "\
+                    "the rate of successful microsatellite marker development"
+print "\nSee Griffiths et al. (currently unpublished) for more details......"
+time.sleep(2)
 
 # Get values for all the required and optional arguments
 
@@ -95,257 +148,188 @@
 parser.add_argument('-p','--pal_finder', help='Output from pal_finder ', \
                     required=True)
 
-parser.add_argument('-assembly','--assembly_QC', help='Perform the PandaSeq \
-                    based QC', nargs='?', const=1, type=int, required=False)
+parser.add_argument('-assembly', help='Perform the PandaSeq based QC', \
+                    action='store_true')
 
 parser.add_argument('-a','--skip_assembly', help='If the assembly has already \
-                    been run, skip it with -a', nargs='?', const=1, \
-                    type=int, required=False)
+                    been run, skip it with -a', action='store_true')
 
 parser.add_argument('-c','--skip_conversion', help='If the fastq to fasta \
                     conversion has already been run, skip it with -c', \
-                    nargs='?', const=1, type=int, required=False)
+                    action='store_true')
 
-parser.add_argument('-primers','--filter_by_primer_column', help='Filter \
+parser.add_argument('-primers', help='Filter \
                     pal_finder output to just those loci which have primers \
-                    designed', nargs='?', const=1, type=int, required=False)
+                    designed', action='store_true')
 
-parser.add_argument('-occurrences','--filter_by_occurrences_column', \
+parser.add_argument('-occurrences', \
                     help='Filter pal_finder output to just loci with primers \
-                    which only occur once in the dataset', nargs='?', \
-                    const=1, type=int, required=False)
+                    which only occur once in the dataset', action='store_true')
 
-parser.add_argument('-rankmotifs','--filter_and_rank_by_motif_size', \
+parser.add_argument('-rankmotifs', \
                     help='Filter pal_finder output to just loci which are a \
                     perfect repeat unit. Also, rank the loci by motif size \
-                    (largest first)', nargs='?', const=1, type=int, \
-                    required=False)
-
-args = vars(parser.parse_args())
-
-# parse arguments
-
-R1_input = args['input1'];
-R2_input = args['input2'];
-pal_finder_output = args['pal_finder'];
-perform_assembly = args['assembly_QC']
-skip_assembly = args['skip_assembly'];
-skip_conversion = args['skip_conversion'];
-filter_primers = args['filter_by_primer_column'];
-filter_occurrences = args['filter_by_occurrences_column'];
-filter_rank_motifs = args['filter_and_rank_by_motif_size'];
+                    (largest first)', action='store_true')
 
-# set default values for arguments
-if perform_assembly is None:
-    perform_assembly = 0
-if skip_assembly is None:
-    skip_assembly = 0
-if skip_conversion is None:
-    skip_conversion = 0
-if filter_primers is None:
-    filter_primers = 0
-if filter_occurrences is None:
-    filter_occurrences = 0
-if filter_rank_motifs is None:
-    filter_rank_motifs = 0
-
-############################################################
-# Function List                                            #
-############################################################
-
-# Reverse complement a sequence
-
-def ReverseComplement1(seq):
-    seq_dict = {'A':'T','T':'A','G':'C','C':'G'}
-    return "".join([seq_dict[base] for base in reversed(seq)])
+parser.add_argument('-v', '--get_version', help='Print the version number of \
+                    this pal_filter script', action='store_true')
 
-# Convert a .fastq to a .fasta, filter to just lines we want
-# and strip MiSeq barcodes
-
-def fastq_to_fasta(file):
-    file_name = os.path.splitext(os.path.basename(file))[0]
-    with open(file_name + "_filtered.fasta", "w") as out:
-        for record in SeqIO.parse(file, "fastq"):
-            ID = str(record.id)
-            SEQ = str(record.seq)
-            if ID in wanted:
-                out.write(">" + ID + "\n" + SEQ + "\n")
-
-# strip the miseq barcodes from a fasta file
+args = parser.parse_args()
 
-def strip_barcodes(file):
-    file_name = os.path.splitext(os.path.basename(file))[0]
-    with open(file_name + "_adapters_removed.fasta", "w") as out:
-        for record in SeqIO.parse(file, "fasta"):
-            match = re.search(r'\S*:', record.id)
-            if match:
-                correct = match.group().rstrip(":")
-            else:
-                correct = str(record.id)
-            SEQ = str(record.seq)
-            if correct in wanted:
-                out.write(">" + correct + "\n" + SEQ + "\n")
-
-############################################################
-# MAIN PROGRAM                                             #
-############################################################
-print "\n~~~~~~~~~~"
-print "pal_filter"
-print "~~~~~~~~~~\n"
-time.sleep(1)
-print "\"Find the optimum loci in your pal_finder output and increase "\
-                    "the rate of successful microsatellite marker development\""
-print "\nSee Griffiths et al. (currently unpublished) for more details......"
-time.sleep(2)
-
-if (perform_assembly == 0 and filter_primers == 0 and filter_occurrences == 0 \
-                    and filter_rank_motifs == 0):
+if not args.assembly and not args.primers and not args.occurrences \
+        and not args.rankmotifs:
     print "\nNo optional arguments supplied."
     print "\nBy default this program does nothing."
     print "\nNo files produced and no modifications made."
     print "\nFinished.\n"
     exit()
-
 else:
     print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
     print "Checking supplied filtering parameters:"
     print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
     time.sleep(2)
-    if filter_primers == 1:
+    if args.get_version:
+        print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+        print "pal_filter version is " + __version__ + " (03/03/2016)"
+        print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
+    if args.primers:
         print "-primers flag supplied."
         print "Filtering pal_finder output on the \"Primers found (1=y,0=n)\"" \
                     " column."
         print "Only rows where primers have successfully been designed will"\
                     " pass the filter.\n"
         time.sleep(2)
-    if filter_occurrences == 1:
+    if args.occurrences:
         print "-occurrences flag supplied."
-        print "Filtering pal_finder output on the \"Occurences of Forward" \
-                    " Primer in Reads\" and \"Occurences of Reverse Primer" \
+        print "Filtering pal_finder output on the \"Occurrences of Forward" \
+                    " Primer in Reads\" and \"Occurrences of Reverse Primer" \
                     " in Reads\" columns."
         print "Only rows where both primers occur only a single time in the"\
                     " reads pass the filter.\n"
         time.sleep(2)
-    if filter_rank_motifs == 1:
+    if args.rankmotifs:
         print "-rankmotifs flag supplied."
         print "Filtering pal_finder output on the \"Motifs(bases)\" column to" \
                     " just those with perfect repeats."
         print "Only rows containing 'perfect' repeats will pass the filter."
         print "Also, ranking output by size of motif (largest first).\n"
         time.sleep(2)
+
 # index the raw fastq files so that the sequences can be pulled out and
 # added to the filtered output file
-
-# Indexing input sequence files
 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
 print "Indexing FastQ files....."
 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
-R1fastq_sequences_index = SeqIO.index(R1_input,'fastq')
-R2fastq_sequences_index = SeqIO.index(R2_input,'fastq')
+R1fastq_sequences_index = SeqIO.index(args.input1,'fastq')
+R2fastq_sequences_index = SeqIO.index(args.input2,'fastq')
 print "Indexing complete."
 
 # create a set to hold the filtered output
 wanted_lines = set()
 
 # get lines from the pal_finder output which meet filter settings
-
 # read the pal_finder output file into a csv reader
-with open (pal_finder_output) as csvfile_infile:
+with open (args.pal_finder) as csvfile_infile:
     csv_f = csv.reader(csvfile_infile, delimiter='\t')
-    header =  csv_f.next()
-    with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + \
-                    ".filtered", 'w') as csvfile_outfile:
-# write the header line for the output file
-        csvfile_outfile.write('\t'.join(header) + "\tR1_Sequence_ID\t"
-                              "R1_Sequence\tR2_Sequence_ID\tR2_Sequence\n")
+    header = csv_f.next()
+    header.extend(("R1_Sequence_ID", \
+                   "R1_Sequence", \
+                   "R2_Sequence_ID", \
+                   "R2_Sequence" + "\n"))
+    with open( \
+            os.path.splitext(os.path.basename(args.pal_finder))[0] + \
+            ".filtered", 'w') as csvfile_outfile:
+        # write the header line for the output file
+        csvfile_outfile.write('\t'.join(header))
         for row in csv_f:
-# get the sequence ID
+            # get the sequence ID
             seq_ID = row[0]
-# get the raw sequence reads and convert to a format that can
-# go into a tsv file
+            # get the raw sequence reads and convert to a format that can
+            # go into a tsv file
             R1_sequence = R1fastq_sequences_index[seq_ID].format("fasta").\
                     replace("\n","\t",1).replace("\n","")
             R2_sequence = R2fastq_sequences_index[seq_ID].format("fasta").\
                     replace("\n","\t",1).replace("\n","")
             seq_info = "\t" + R1_sequence + "\t" + R2_sequence + "\n"
-# navigate through all different combinations of filter options
-# if the primer filter is switched on
-            if filter_primers == 1:
-# check the occurrences of primers field
+            # navigate through all different combinations of filter options
+            # if the primer filter is switched on
+            if args.primers:
+                # check the occurrences of primers field
                 if row[5] == "1":
-# if filter occurrences of primers is switched on
-                    if filter_occurrences == 1:
-# check the occurrences of primers field
+                    # if filter occurrences of primers is switched on
+                    if args.occurrences:
+                        # check the occurrences of primers field
                         if (row[15] == "1" and row[16] == "1"):
-# if rank by motif is switched on
-                            if filter_rank_motifs == 1:
-# check for perfect motifs
+                            # if rank by motif is switched on
+                            if args.rankmotifs:
+                                # check for perfect motifs
                                 if row[1].count('(') == 1:
-# all 3 filter switched on
-# write line out to output
+                                    # all 3 filter switched on
+                                    # write line out to output
                                     csvfile_outfile.write('\t'.join(row) + \
                                                         seq_info)
-
                             else:
                                 csvfile_outfile.write('\t'.join(row) + seq_info)
-                    elif filter_rank_motifs == 1:
+                    elif args.rankmotifs:
                         if row[1].count('(') == 1:
                             csvfile_outfile.write('\t'.join(row) + seq_info)
                     else:
                         csvfile_outfile.write('\t'.join(row) + seq_info)
-            elif filter_occurrences == 1:
+            elif args.occurrences:
                 if (row[15] == "1" and row[16] == "1"):
-                    if filter_rank_motifs == 1:
+                    if args.rankmotifs:
                         if row[1].count('(') == 1:
                             csvfile_outfile.write('\t'.join(row) + seq_info)
                     else:
                         csvfile_outfile.write('\t'.join(row) + seq_info)
-            elif filter_rank_motifs == 1:
+            elif args.rankmotifs:
                 if row[1].count('(') == 1:
                     csvfile_outfile.write('\t'.join(row) + seq_info)
             else:
                 csvfile_outfile.write('\t'.join(row) + seq_info)
 
 # if filter_rank_motifs is active, order the file by the size of the motif
-if filter_rank_motifs == 1:
+if args.rankmotifs:
     rank_motif = []
     ranked_list = []
 # read in the non-ordered file and add every entry to rank_motif list
-    with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + \
-                        ".filtered") as csvfile_ranksize:
+    with open( \
+            os.path.splitext(os.path.basename(args.pal_finder))[0] + \
+            ".filtered") as csvfile_ranksize:
         csv_rank = csv.reader(csvfile_ranksize, delimiter='\t')
         header = csv_rank.next()
         for line in csv_rank:
             rank_motif.append(line)
 
-# open the file ready to write the ordered list
-    with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + \
-                        ".filtered", 'w') as rank_outfile:
+    # open the file ready to write the ordered list
+    with open( \
+            os.path.splitext(os.path.basename(args.pal_finder))[0] + \
+            ".filtered", 'w') as rank_outfile:
         rankwriter = csv.writer(rank_outfile, delimiter='\t', \
                         lineterminator='\n')
         rankwriter.writerow(header)
         count = 2
         while count < 10:
             for row in rank_motif:
-# count size of motif
+                # count size of motif
                 motif = re.search(r'[ATCG]*',row[1])
                 if motif:
                     the_motif = motif.group()
-# rank it and write into ranked_list
+                    # rank it and write into ranked_list
                     if len(the_motif) == count:
                         ranked_list.insert(0, row)
             count = count +  1
-# write out the ordered list, into the .filtered file
+        # write out the ordered list, into the .filtered file
         for row in ranked_list:
             rankwriter.writerow(row)
 
 print "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
 print "Checking assembly flags supplied:"
 print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
-if perform_assembly == 0:
+if not args.assembly:
     print "Assembly flag not supplied. Not performing assembly QC.\n"
-if perform_assembly == 1:
-    print "-assembly flag supplied: Performing PandaSeq assembly quality checks."
+if args.assembly:
+    print "-assembly flag supplied: Perform PandaSeq assembly quality checks."
     print "See Fox et al. (currently unpublished) for full details on the"\
                         " quality-check process.\n"
     time.sleep(5)
@@ -355,8 +339,9 @@
     motif = []
     F_primers = []
     R_primers = []
-    with open(os.path.splitext(os.path.basename(pal_finder_output))[0] + \
-                    ".filtered") as input_csv:
+    with open( \
+            os.path.splitext(os.path.basename(args.pal_finder))[0] + \
+            ".filtered") as input_csv:
         pal_finder_csv = csv.reader(input_csv, delimiter='\t')
         header = pal_finder_csv.next()
         for row in pal_finder_csv:
@@ -369,132 +354,141 @@
     wanted = set()
     for line in seqIDs:
         wanted.add(line)
-
-# Assemble the paired end reads into overlapping contigs using PandaSeq
-# (can be skipped with the -a flag if assembly has already been run
-# and the appropriate files are in the same directory as the script,
-# and named "Assembly.fasta" and "Assembly_adapters_removed.fasta")
-#
-# The first time you riun the script you MUST not enable the -a flag.t
-# but you are able to skip the assembly in subsequent analysis using the
-# -a flag.
+    """
+    Assemble the paired end reads into overlapping contigs using PandaSeq
+    (can be skipped with the -a flag if assembly has already been run
+    and the appropriate files are in the same directory as the script,
+    and named "Assembly.fasta" and "Assembly_adapters_removed.fasta")
 
-    if skip_assembly == 0:
-        pandaseq_command = 'pandaseq -A pear -f ' + R1_input + ' -r ' + \
-                        R2_input + ' -o 25 -t 0.95 -w Assembly.fasta'
+    The first time you riun the script you MUST not enable the -a flag.t
+    but you are able to skip the assembly in subsequent analysis using the
+    -a flag.
+    """
+    if not args.skip_assembly:
+        pandaseq_command = 'pandaseq -A pear -f ' + args.input1 + ' -r ' + \
+                        args.input2 + ' -o 25 -t 0.95 -w Assembly.fasta'
         subprocess.call(pandaseq_command, shell=True)
-        strip_barcodes("Assembly.fasta")
+        strip_barcodes("Assembly.fasta", wanted)
         print "\nPaired end reads been assembled into overlapping reads."
-        print "\nFor future analysis, you can skip this assembly step using \
-                        the -a flag, provided that the assembly.fasta file is \
-                        intact and in the same location."
+        print "\nFor future analysis, you can skip this assembly step using" \
+                        " the -a flag, provided that the assembly.fasta file" \
+                        " is intact and in the same location."
     else:
         print "\n(Skipping the assembly step as you provided the -a flag)"
-
-# Fastq files need to be converted to fasta. The first time you run the script
-# you MUST not enable the -c flag, but you are able to skip the conversion
-# later using the -c flag. Make sure the fasta files are in the same location
-#and do not change the filenames
-
-    if skip_conversion ==0:
-        fastq_to_fasta(R1_input)
-        fastq_to_fasta(R2_input)
+    """
+    Fastq files need to be converted to fasta. The first time you run the script
+    you MUST not enable the -c flag, but you are able to skip the conversion
+    later using the -c flag. Make sure the fasta files are in the same location
+    and do not change the filenames
+    """
+    if not args.skip_conversion:
+        fastq_to_fasta(args.input1, wanted)
+        fastq_to_fasta(args.input2, wanted)
         print "\nThe input fastq files have been converted to the fasta format."
-        print "\nFor any future analysis, you can skip this conversion step \
-                        using the -c flag, provided that the fasta files \
-                        are intact and in the same location."
+        print "\nFor any future analysis, you can skip this conversion step" \
+                        " using the -c flag, provided that the fasta files" \
+                        " are intact and in the same location."
     else:
         print "\n(Skipping the fastq -> fasta conversion as you provided the" \
                         " -c flag).\n"
 
-# get the files and everything else we will need
-
-# Assembled fasta file
+    # get the files and everything else needed
+    # Assembled fasta file
     assembly_file = "Assembly_adapters_removed.fasta"
-# filtered R1 reads
-    R1_fasta = os.path.splitext(os.path.basename(R1_input))[0] + \
-                        "_filtered.fasta"
-# filtered R2 reads
-    R2_fasta = os.path.splitext(os.path.basename(R2_input))[0] + \
-                        "_filtered.fasta"
-
-    outputfilename = os.path.splitext(os.path.basename(R1_input))[0]
-
-# parse the files with SeqIO
+    # filtered R1 reads
+    R1_fasta = os.path.splitext( \
+        os.path.basename(args.input1))[0] + "_filtered.fasta"
+    # filtered R2 reads
+    R2_fasta = os.path.splitext( \
+        os.path.basename(args.input2))[0] + "_filtered.fasta"
+    outputfilename = os.path.splitext(os.path.basename(args.input1))[0]
+    # parse the files with SeqIO
     assembly_sequences = SeqIO.parse(assembly_file,'fasta')
     R1fasta_sequences = SeqIO.parse(R1_fasta,'fasta')
 
-# create some empty lists to hold the ID tags we are interested in
+    # create some empty lists to hold the ID tags we are interested in
     assembly_IDs = []
     fasta_IDs = []
 
-# populate the above lists with sequence IDs
+    # populate the above lists with sequence IDs
     for sequence in assembly_sequences:
         assembly_IDs.append(sequence.id)
     for sequence in R1fasta_sequences:
         fasta_IDs.append(sequence.id)
 
-# Index the assembly fasta file
+    # Index the assembly fasta file
     assembly_sequences_index = SeqIO.index(assembly_file,'fasta')
     R1fasta_sequences_index = SeqIO.index(R1_fasta,'fasta')
     R2fasta_sequences_index = SeqIO.index(R2_fasta,'fasta')
 
-# prepare the output file
-    with open (outputfilename + "_pal_filter_assembly_output.txt", 'w') \
-                    as outputfile:
-# write the headers for the output file
-        outputfile.write("readPairID\t Forward Primer\t F Primer Position in "
-                         "Assembled Read\t Reverse Primer\t R Primer Position in "
-                         "Assembled Read\t Motifs(bases)\t Assembled Read ID\t "
-                         "Assembled Read Sequence\t Raw Forward Read ID\t Raw "
-                         "Forward Read Sequence\t Raw Reverse Read ID\t Raw Reverse "
-                         "Read Sequence\n")
+    # prepare the output file
+    with open ( \
+            outputfilename + "_pal_filter_assembly_output.txt", 'w') \
+            as outputfile:
+        # write the headers for the output file
+        output_header = ("readPairID", \
+                    "Forward Primer",\
+                    "F Primer Position in Assembled Read", \
+                    "Reverse Primer", \
+                    "R Primer Position in Assembled Read", \
+                    "Motifs(bases)", \
+                    "Assembled Read ID", \
+                    "Assembled Read Sequence", \
+                    "Raw Forward Read ID", \
+                    "Raw Forward Read Sequence", \
+                    "Raw Reverse Read ID", \
+                    "Raw Reverse Read Sequence\n")
+        outputfile.write("\t".join(output_header))
 
-# cycle through parameters from the pal_finder output
+        # cycle through parameters from the pal_finder output
         for x, y, z, a in zip(seqIDs, F_primers, R_primers, motif):
             if str(x) in assembly_IDs:
             # get the raw sequences ready to go into the output file
                 assembly_seq = (assembly_sequences_index.get_raw(x).decode())
-# fasta entries need to be converted to single line so sit nicely in the output
-                assembly_output = assembly_seq.replace("\n","\t")
+                # fasta entries need to be converted to single line so sit
+                # nicely in the output
+                assembly_output = assembly_seq.replace("\n","\t").strip('\t')
                 R1_fasta_seq = (R1fasta_sequences_index.get_raw(x).decode())
                 R1_output = R1_fasta_seq.replace("\n","\t",1).replace("\n","")
                 R2_fasta_seq = (R2fasta_sequences_index.get_raw(x).decode())
                 R2_output = R2_fasta_seq.replace("\n","\t",1).replace("\n","")
                 assembly_no_id = '\n'.join(assembly_seq.split('\n')[1:])
 
-# check that both primer sequences can be seen in the assembled contig
+                # check that both primer sequences can be seen in the
+                # assembled contig
                 if y or ReverseComplement1(y) in assembly_no_id and z or \
                                     ReverseComplement1(z) in assembly_no_id:
                     if y in assembly_no_id:
-# get the positions of the primers in the assembly to predict fragment length
+                        # get the positions of the primers in the assembly
+                        # (can be used to predict fragment length)
                         F_position = assembly_no_id.index(y)+len(y)+1
                     if ReverseComplement1(y) in assembly_no_id:
-                        F_position = assembly_no_id.index(ReverseComplement1(y)\
-                                            )+len(ReverseComplement1(y))+1
+                        F_position = assembly_no_id.index( \
+                            ReverseComplement1(y))+len(ReverseComplement1(y))+1
                     if z in assembly_no_id:
                         R_position = assembly_no_id.index(z)+1
                     if ReverseComplement1(z) in assembly_no_id:
-                        R_position = assembly_no_id.index(ReverseComplement1(z)\
-                                            )+1
-
-# write everything out into the output file
-                    output = (str(x) + "\t" + y + "\t" + str(F_position) \
-                                        + "\t" + (z) + "\t" + str(R_position) \
-                                        + "\t" + a + "\t" + assembly_output \
-                                        + R1_output + "\t" + R2_output + "\n")
-                    outputfile.write(output)
+                        R_position = assembly_no_id.index( \
+                        ReverseComplement1(z))+1
+                    output = (str(x),
+                                str(y),
+                                str(F_position),
+                                str(z),
+                                str(R_position),
+                                str(a),
+                                str(assembly_output),
+                                str(R1_output),
+                                str(R2_output + "\n"))
+                    outputfile.write("\t".join(output))
         print "\nPANDAseq quality check complete."
         print "Results from PANDAseq quality check (and filtering, if any" \
                                 " any filters enabled) written to output file" \
-                                " ending \"_pal_filter_assembly_output.txt\".\n\n"
-
+                                " ending \"_pal_filter_assembly_output.txt\".\n"
     print "Filtering of pal_finder results complete."
     print "Filtered results written to output file ending \".filtered\"."
     print "\nFinished\n"
 else:
-    if (skip_assembly == 1 or skip_conversion == 1):
+    if args.skip_assembly or args.skip_conversion:
         print "\nERROR: You cannot supply the -a flag or the -c flag without \
                     also supplying the -assembly flag.\n"
-
         print "\nProgram Finished\n"
author	pjbriggs
date	Mon, 21 Mar 2016 06:52:43 -0400
parents	e1a14ed7a9d6
children	8159dab5dbdb