Mercurial > repos > drosofff > msp_sr_readmap_and_size_histograms
comparison readmap.py @ 0:ac7d8e55bb67 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_readmap_and_size_histograms commit fe40dec87779c1fcfbd03330e653aa886f4a2cda
author | drosofff |
---|---|
date | Wed, 21 Oct 2015 11:13:18 -0400 |
parents | |
children | ebfc73c72652 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:ac7d8e55bb67 |
---|---|
1 #!/usr/bin/python | |
2 # python parser module for for readmaps and size distributions, guided by GFF3 | |
3 # version 0.9.1 (1-6-2014) | |
4 # Usage readmap.py <1:index source> <2:extraction directive> <3:output pre-mir> <4: output mature miRs> <5:mirbase GFF3> | |
5 # <6:pathToLatticeDataframe or "dummy_dataframe_path"> <7:Rcode or "dummy_plotCode"> <8:latticePDF or "dummy_latticePDF"> | |
6 # <9:10:11 filePath:FileExt:FileLabel> <.. ad lib> | |
7 | |
8 import sys, subprocess, argparse | |
9 from smRtools import * | |
10 from collections import OrderedDict, defaultdict | |
11 import os | |
12 | |
13 def Parser(): | |
14 the_parser = argparse.ArgumentParser() | |
15 the_parser.add_argument('--output_readmap', action="store", type=str, help="readmap dataframe") | |
16 the_parser.add_argument('--output_size_distribution', action="store", type=str, help="size distribution dataframe") | |
17 the_parser.add_argument('--reference_fasta', action="store", type=str, help="output file") | |
18 the_parser.add_argument('--reference_bowtie_index',action='store', help="paths to indexed or fasta references") | |
19 the_parser.add_argument('--input',nargs='+', help="paths to multiple input files") | |
20 the_parser.add_argument('--ext',nargs='+', help="input file type") | |
21 the_parser.add_argument('--label',nargs='+', help="labels of multiple input files") | |
22 the_parser.add_argument('--normalization_factor',nargs='+', type=float, help="Normalization factor for input file") | |
23 the_parser.add_argument('--gff', type=str, help="GFF containing regions of interest") | |
24 the_parser.add_argument('--minquery', type=int, help="Minimum readsize") | |
25 the_parser.add_argument('--maxquery', type=int, help="Maximum readsize") | |
26 the_parser.add_argument('--rcode', type=str, help="R script") | |
27 args = the_parser.parse_args() | |
28 return args | |
29 | |
30 args=Parser() | |
31 if args.reference_fasta: | |
32 genomeRefFormat = "fastaSource" | |
33 genomeRefFile = args.reference_fasta | |
34 if args.reference_bowtie_index: | |
35 genomeRefFormat = "bowtieIndex" | |
36 genomeRefFile = args.reference_bowtie_index | |
37 readmap_file=args.output_readmap | |
38 size_distribution_file=args.output_size_distribution | |
39 minquery=args.minquery | |
40 maxquery=args.maxquery | |
41 Rcode = args.rcode | |
42 filePath=args.input | |
43 fileExt=args.ext | |
44 fileLabel=args.label | |
45 normalization_factor=args.normalization_factor | |
46 | |
47 MasterListOfGenomes = OrderedDict() | |
48 | |
49 def process_samples(filePath): | |
50 for i, filePath in enumerate(filePath): | |
51 norm=normalization_factor[i] | |
52 print fileLabel[i] | |
53 MasterListOfGenomes[fileLabel[i]] = HandleSmRNAwindows (alignmentFile=filePath, alignmentFileFormat=fileExt[i], genomeRefFile=genomeRefFile, genomeRefFormat=genomeRefFormat,\ | |
54 biosample=fileLabel[i], size_inf=minquery, size_sup=maxquery, norm=norm) | |
55 return MasterListOfGenomes | |
56 | |
57 def dataframe_sanityzer (listofdatalines): | |
58 Dict = defaultdict(float) | |
59 for line in listofdatalines: | |
60 fields= line.split("\t") | |
61 Dict[fields[0]] += float (fields[2]) | |
62 filtered_list = [] | |
63 for line in listofdatalines: | |
64 fields= line.split("\t") | |
65 if Dict[fields[0]] != 0: | |
66 filtered_list.append(line) | |
67 return filtered_list | |
68 | |
69 def write_readplot_dataframe(readDict, readmap_file): | |
70 listoflines = [] | |
71 with open(readmap_file, 'w') as readmap: | |
72 print >>readmap, "gene\tcoord\tcount\tpolarity\tsample" | |
73 for sample in readDict.keys(): | |
74 if args.gff: | |
75 dict=readDict[sample] | |
76 else: | |
77 dict=readDict[sample].instanceDict | |
78 for gene in dict.keys(): | |
79 plottable = dict[gene].readplot() | |
80 for line in plottable: | |
81 #print >>readmap, "%s\t%s" % (line, sample) | |
82 listoflines.append ("%s\t%s" % (line, sample)) | |
83 listoflines = dataframe_sanityzer(listoflines) | |
84 for line in listoflines: | |
85 print >>readmap, line | |
86 | |
87 def write_size_distribution_dataframe(readDict, size_distribution_file): | |
88 listoflines = [] | |
89 with open(size_distribution_file, 'w') as size_distrib: | |
90 print >>size_distrib, "gene\tsize\tcount\tpolarity\tsample" # test before was "gene\tpolarity\tsize\tcount\tsample" | |
91 for sample in readDict.keys(): | |
92 if args.gff: | |
93 dict=readDict[sample] | |
94 else: | |
95 dict=readDict[sample].instanceDict | |
96 for gene in dict.keys(): | |
97 histogram = dict[gene].size_histogram(minquery=args.minquery, maxquery=args.maxquery) | |
98 for polarity in histogram.keys(): | |
99 if polarity=='both': | |
100 continue | |
101 #for size in xrange(args.minquery, args.maxquery): | |
102 # if not size in histogram[polarity].keys(): | |
103 # histogram[size]=0 | |
104 for size, count in histogram[polarity].iteritems(): | |
105 #print >>size_distrib, "%s\t%s\t%s\t%s\t%s" % (gene, size, count, polarity, sample) # test, changed the order accordingly | |
106 listoflines.append ("%s\t%s\t%s\t%s\t%s" % (gene, size, count, polarity, sample) ) | |
107 listoflines = dataframe_sanityzer(listoflines) | |
108 for line in listoflines: | |
109 print >>size_distrib, line | |
110 | |
111 def gff_item_subinstances(readDict, gff3): | |
112 GFFinstanceDict=OrderedDict() | |
113 for sample in readDict.keys(): | |
114 GFFinstanceDict[sample]={} # to implement the 2nd level of directionary in an OrderedDict Class object (would not be required with defaultdict Class) | |
115 with open(gff3) as gff: | |
116 for line in gff: | |
117 if line[0] == "#": continue | |
118 gff_fields = line[:-1].split("\t") | |
119 chrom = gff_fields[0] | |
120 gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0] # to isolate the GFF Name | |
121 item_upstream_coordinate = int(gff_fields[3]) | |
122 item_downstream_coordinate = int(gff_fields[4]) | |
123 item_polarity = gff_fields[6] | |
124 for sample in readDict.keys(): | |
125 ## this is not required anymore but test | |
126 # if not GFFinstanceDict.has_key(sample): | |
127 # GFFinstanceDict[sample]={} | |
128 #### | |
129 subinstance=extractsubinstance(item_upstream_coordinate, item_downstream_coordinate, readDict[sample].instanceDict[chrom]) | |
130 if item_polarity == '-': | |
131 subinstance.readDict={key*-1:value for key, value in subinstance.readDict.iteritems()} | |
132 subinstance.gene=gff_name | |
133 GFFinstanceDict[sample][gff_name]=subinstance | |
134 return GFFinstanceDict | |
135 | |
136 MasterListOfGenomes=process_samples(filePath) | |
137 | |
138 if args.gff: | |
139 MasterListOfGenomes=gff_item_subinstances(MasterListOfGenomes, args.gff) | |
140 | |
141 write_readplot_dataframe(MasterListOfGenomes, readmap_file) | |
142 write_size_distribution_dataframe(MasterListOfGenomes, size_distribution_file) | |
143 | |
144 R_command="Rscript "+ Rcode | |
145 process = subprocess.Popen(R_command.split()) | |
146 process.wait() | |
147 |