comparison reheader.py @ 0:331fd79a9341 draft

"planemo upload for repository https://github.com/QFAB-Bioinformatics/metaDEGalaxy/tree/master/reheader commit 6783cd68521863b34f8e77cbb7ba404700c72313-dirty"
author qfabrepo
date Thu, 03 Sep 2020 01:32:31 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:331fd79a9341
1 #!/usr/bin/env python
2 import sys
3 from Bio.Seq import Seq
4 from Bio import SeqIO
5 from Bio.SeqRecord import SeqRecord
6 from os.path import basename
7 import os
8 import re
9 import argparse
10
# Command-line interface of the re-header script.
parser = argparse.ArgumentParser(
    description="reformat the read name (header) by appending the sample name to the read name\n" +
    "Example:\n python reheader.py -n F3D0_R1.fastq -i test-data/F3D0_R1.fastq -o test-data/test -l mylog -d test-data/",
    # Keep the explicit line breaks of the description in the --help output;
    # the default formatter would re-wrap them into a single paragraph.
    formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument("-v", "--version", action="version", version="%(prog)s 1.0")
# Every option below is mandatory: the script uses all five values
# unconditionally, so argparse reports a missing one as a usage error
# instead of letting a placeholder default reach the path/IO calls.
parser.add_argument("-n", "--samplename", dest="samplename", required=True, help="input sample name")
parser.add_argument("-i", "--input", dest="inputfile", required=True, help="input filename in FASTQ format")
parser.add_argument("-l", "--log", dest="logfile", required=True, help="output log file")
parser.add_argument("-o", "--outfile", dest="outputfile", required=True, help="output filename")
parser.add_argument("-d", "--outdir", dest="outputdir", required=True, help="output directory")


# Called with no arguments at all: show the full help on stderr and stop.
if len(sys.argv) == 1:
    parser.print_help(sys.stderr)
    sys.exit()

args = parser.parse_args()


# Unpack the parsed options into the module-level names used by the rest of
# the script.
filename = args.samplename
infile = args.inputfile
# Sample label: the sample name without its directory part and extension.
str_to_add = os.path.splitext(basename(filename))[0]
outfile = args.outputfile
outdir = args.outputdir
logfile = args.logfile
35
36
# Read-direction markers found in file names, mapped to the read suffix
# that replaces them in the label appended to sequence headers.
rdict = {
    '_R1': '/1',
    '_R2': '/2',
    '_1': '/1',
    '_2': '/2',
}

# The same markers mapped to the empty string: used to strip the marker
# (and any trailing token) from the label.
rdict_remove = {
    '_R1': '',
    '_R2': '',
    '_1': '',
    '_2': '',
}


def _replace_read_marker(s, mapping):
    """Replace the read-direction marker of ``s`` using ``mapping``.

    Markers are tried from the most specific (longest) to the least
    specific, and only the first marker that occurs in ``s`` is applied.
    For that marker only the right-most occurrence is replaced; if it is
    followed by a single trailing alphanumeric token running to the end of
    the string (e.g. ``_R1_001``), that token is consumed as well.

    Restricting the substitution to one marker and one occurrence keeps
    numeric parts of the name intact: ``run_12_R1`` becomes ``run_12/1``
    rather than having its ``_1`` in ``_12`` rewritten too.

    Parameters:
        s: label to transform.
        mapping: dict of marker -> replacement text.

    Returns:
        The transformed label, or ``s`` unchanged when no marker is found.
    """
    # Longest marker first so "_R1"/"_R2" outrank the bare "_1"/"_2";
    # the secondary key makes the order deterministic for equal lengths.
    for marker in sorted(mapping, key=lambda m: (-len(m), m)):
        escaped = re.escape(marker)
        # Either "marker + optional '_' + alphanumeric tail up to the end",
        # or the bare marker anywhere in the string.
        pattern = re.compile(escaped + r'_?[A-Za-z0-9]+$|' + escaped)
        hits = list(pattern.finditer(s))
        if hits:
            last = hits[-1]
            return s[:last.start()] + mapping[marker] + s[last.end():]
    return s


def makesubs(s):
    """Return ``s`` with its read-direction marker turned into ``/1`` or ``/2``.

    Example: ``F3D0_S188_L001_R1_001`` -> ``F3D0_S188_L001/1``.
    """
    return _replace_read_marker(s, rdict)


def makesubs_remove(s):
    """Return ``s`` with its read-direction marker removed.

    Example: ``F3D0_S188_L001_R1_001`` -> ``F3D0_S188_L001``.
    """
    return _replace_read_marker(s, rdict_remove)
66
def appendStringToSequenceHeader(inputfile,header_to_add):
    """Read a FASTQ file and return its records with re-labelled ids.

    Each returned record is a new SeqRecord that carries the original
    sequence, an id of the form ``<original id>_<header_to_add>``, an empty
    description, and the original "phred_quality" letter annotations.

    Parameters:
        inputfile: FASTQ input handed to ``SeqIO.parse``.
        header_to_add: text appended (after an underscore) to every id.

    Returns:
        A list with one new SeqRecord per input record, in input order.
    """
    relabelled = []
    for source in SeqIO.parse(inputfile, "fastq"):
        new_id = "{0}".format(source.id) + "_" + header_to_add
        copy = SeqRecord(source.seq, id=new_id, description="")
        # Carry the per-base qualities over to the new record.
        qualities = source.letter_annotations["phred_quality"]
        copy.letter_annotations["phred_quality"] = qualities
        relabelled.append(copy)
    return relabelled
76
# Derive both labels from the sample name: the bare sample name (read marker
# stripped) goes to the log, the "/1" or "/2" form is appended to the headers.
str_to_search = makesubs_remove(str_to_add)
str_to_add = makesubs(str_to_add)

# The context manager closes the log file even if parsing the input or
# writing the output raises.
with open(os.path.join(outdir, logfile), "w") as outlogfile:
    final_records = appendStringToSequenceHeader(infile, str_to_add)
    outlogfile.write(str_to_search)
    SeqIO.write(final_records, outfile, "fastq")