diff reheader.py @ 0:331fd79a9341 draft

"planemo upload for repository https://github.com/QFAB-Bioinformatics/metaDEGalaxy/tree/master/reheader commit 6783cd68521863b34f8e77cbb7ba404700c72313-dirty"
author qfabrepo
date Thu, 03 Sep 2020 01:32:31 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/reheader.py	Thu Sep 03 01:32:31 2020 +0000
@@ -0,0 +1,85 @@
+#!/usr/bin/env python 
+import sys
+from Bio.Seq import Seq
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+from os.path import basename
+import os
+import re
+import argparse
+
+parser = argparse.ArgumentParser(
+    description="reformat the read name (header) by appending the sample name to the read name\n" + 
+				"Example:\n python reheader.py -n F3D0_R1.fastq -i test-data/F3D0_R1.fastq -o test-data/test -l mylog -d test-data/")
+parser.add_argument("-v","--version",action="version",version="%(prog)s 1.0")
+parser.add_argument("-n","--samplename",dest="samplename",default=False,help="input sample name")
+parser.add_argument("-i","--input",dest="inputfile",default=False,help="input filename in FASTQ format")
+parser.add_argument("-l","--log", dest="logfile",default=False,help="output log file")
+parser.add_argument("-o","--outfile",dest="outputfile",default=False,help="output filename")
+parser.add_argument("-d","--outdir",dest="outputdir",default=False,help="output directory")
+
+
+if(len(sys.argv) == 1):
+       parser.print_help(sys.stderr)
+       sys.exit()
+
+args = parser.parse_args()
+
+
+filename = args.samplename
+infile = args.inputfile
+str_to_add = os.path.splitext(basename(filename))[0]
+outfile = args.outputfile
+outdir = args.outputdir
+logfile = args.logfile
+
+
+rdict = {
+    '_R1': '/1',
+    '_R2': '/2',
+    '_1': '/1',
+    '_2': '/2',
+}
+
+rdict_remove = {
+    '_R1': '',
+    '_R2': '',
+    '_1': '',
+    '_2': '',
+}
+
+def makesubs(s):
+    for pattern, repl in rdict.items():
+        pat1 = pattern +'_?[A-Za-z0-9]+$'
+        pat2 = pattern
+        combined_pat = r'|'.join((pat1, pat2))
+        s = re.sub(combined_pat, repl,s)
+    return s
+
+def makesubs_remove(s):
+    for pattern, repl in rdict_remove.items():
+        pat1 = pattern +'_?[A-Za-z0-9]+$'
+        pat2 = pattern
+        combined_pat = r'|'.join((pat1, pat2))
+        s = re.sub(combined_pat, repl,s)
+    return s
+
+def appendStringToSequenceHeader(inputfile,header_to_add):
+    records=[]
+    for seq_record in SeqIO.parse(inputfile, "fastq"):
+        header =seq_record.id
+        header = "{0}".format(header) + "_" +header_to_add
+        record = SeqRecord(seq_record.seq,id=header,description="")
+        record.letter_annotations["phred_quality"]=seq_record.letter_annotations["phred_quality"]
+        records.append(record)
+    return records
+
+str_to_search = makesubs_remove(str_to_add)
+str_to_add = makesubs(str_to_add)
+final_records=[]
+outlogfile=open(os.path.join(outdir,logfile),"w")
+
+final_records=appendStringToSequenceHeader(infile,str_to_add)
+outlogfile.write(str_to_search)
+SeqIO.write(final_records, outfile , "fastq")
+outlogfile.close()