0
|
1 import sys
|
|
2 import pandas as pd
|
|
3
|
|
# Columns selected from every input, whether parsed BLAST or IMGT.
_SHARED_COLUMNS = [
    "ID", "VDJ Frame", "Top V Gene", "Top D Gene", "Top J Gene",
    "CDR1 Seq", "CDR1 Length", "CDR2 Seq", "CDR2 Length",
    "CDR3 Seq", "CDR3 Length", "CDR3 Seq DNA", "CDR3 Length DNA",
    "Strand", "CDR3 Found How",
]

# Extra columns selected only when every input has a "Functionality" column.
_IMGT_COLUMNS = [
    "Functionality", "V-REGION identity %", "V-REGION identity nt",
    "D-REGION reading frame", "AA JUNCTION", "Functionality comment",
    "Sequence", "FR1-IMGT", "FR2-IMGT", "FR3-IMGT", "CDR3-IMGT", "JUNCTION",
    "J-REGION", "FR4-IMGT", "P3V-nt nb", "N1-REGION-nt nb", "P5D-nt nb",
    "P3D-nt nb", "N2-REGION-nt nb", "P5J-nt nb", "3V-REGION trimmed-nt nb",
    "5D-REGION trimmed-nt nb", "3D-REGION trimmed-nt nb",
    "5J-REGION trimmed-nt nb",
]

# Columns this script adds to every row before merging.
_TAG_COLUMNS = ["Sample", "Replicate"]


def _read_table(path):
    """Read a tab-separated file with every column as object dtype.

    Malformed rows are reported and skipped rather than aborting the read.
    """
    try:
        return pd.read_csv(path, sep="\t", dtype=object, on_bad_lines="warn")
    except TypeError:
        # pandas < 1.3 does not know `on_bad_lines`; use the older keyword.
        return pd.read_csv(path, sep="\t", dtype=object, error_bad_lines=False)


def main():
    """Merge per-sample tab-separated tables into one tab-separated file.

    Command line layout, as read from ``sys.argv``:

    * ``argv[1]`` -- id of the first sample.
    * ``argv[2:-2]`` -- a mix of file paths and sample ids.  An argument
      containing ``/`` is read as a table belonging to the current sample;
      any other argument starts a new sample with that id.
    * ``argv[-2]`` -- not read by this function.
      NOTE(review): presumably a flag or title consumed by the caller; confirm.
    * ``argv[-1]`` -- path of the merged output file.

    Tables with a "Functionality" column get their "VDJ Frame" set to
    "In-frame with stop codon" on every row whose "Functionality" is not
    exactly "productive".  Tables without that column count as parsed-BLAST
    input; if at least one is present, only the shared columns are written.

    Each row is tagged with "Sample" (the sample id) and "Replicate" (the
    1-based position of its file within the sample, as a string).  When no
    table was given at all, a header-only file is written.
    """
    patients = {}
    files = []
    sample_id = sys.argv[1]
    blast_files = 0

    # Group the input tables by sample id.
    for arg in sys.argv[2:-2]:
        if "/" not in arg:
            # Not a path: close the current sample and start the next one.
            patients[sample_id] = files
            files = []
            sample_id = arg
            continue

        df = _read_table(arg)
        if "Functionality" in df.columns:
            # Single .loc assignment so the write reaches df itself; chained
            # indexing may write to a temporary copy.
            not_productive = df["Functionality"] != "productive"
            df.loc[not_productive, "VDJ Frame"] = "In-frame with stop codon"
        else:
            blast_files += 1
        files.append(df)
    patients[sample_id] = files

    if blast_files:
        print("Has a parsed blastn file, using limited columns.")
        columns = _SHARED_COLUMNS + _TAG_COLUMNS
    else:
        columns = _SHARED_COLUMNS + _IMGT_COLUMNS + _TAG_COLUMNS
        # Only the first table of the last sample is inspected for the
        # optional column; the `files` guard covers a last sample with no
        # tables.
        if files and "N-REGION-nt nb" in files[0].columns:
            columns.insert(columns.index("N1-REGION-nt nb"), "N-REGION-nt nb")

    # Collect the tagged slices and concatenate once instead of growing a
    # frame row-block by row-block.
    parts = []
    for patient_id, samples in patients.items():
        for replicate, sample in enumerate(samples, 1):
            sample["Sample"] = patient_id
            sample["Replicate"] = str(replicate)
            parts.append(sample[columns])

    if parts:
        result = pd.concat(parts)
    else:
        result = pd.DataFrame(columns=columns)
    result.to_csv(sys.argv[-1], sep="\t", index=False, index_label="index")
|
|
48
|
|
# Run the merge only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|