comparison size_histogram.py @ 0:ef64759eb181 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/msp_sr_size_histograms commit fe40dec87779c1fcfbd03330e653aa886f4a2cda
author drosofff
date Wed, 21 Oct 2015 11:38:40 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:ef64759eb181
1 #!/usr/bin/python
2 # python parser module for size distributions, guided by GFF3
3 # version 0.9.1 (1-6-2014)
4 # Usage readmap.py <1:index source> <2:extraction directive> <3:output pre-mir> <4: output mature miRs> <5:mirbase GFF3>
5 # <6:pathToLatticeDataframe or "dummy_dataframe_path"> <7:Rcode or "dummy_plotCode"> <8:latticePDF or "dummy_latticePDF">
6 # <9:10:11 filePath:FileExt:FileLabel> <.. ad lib>
7
8 import sys, subprocess, argparse
9 from smRtools import *
10 from collections import OrderedDict, defaultdict
11 import os
12
13 def Parser():
14 the_parser = argparse.ArgumentParser()
15 the_parser.add_argument('--output_size_distribution', action="store", type=str, help="size distribution dataframe")
16 the_parser.add_argument('--reference_fasta', action="store", type=str, help="output file")
17 the_parser.add_argument('--reference_bowtie_index',action='store', help="paths to indexed or fasta references")
18 the_parser.add_argument('--input',nargs='+', help="paths to multiple input files")
19 the_parser.add_argument('--ext',nargs='+', help="input file type")
20 the_parser.add_argument('--label',nargs='+', help="labels of multiple input files")
21 the_parser.add_argument('--normalization_factor',nargs='+', type=float, help="Normalization factor for input file")
22 the_parser.add_argument('--gff', type=str, help="GFF containing regions of interest")
23 the_parser.add_argument('--minquery', type=int, help="Minimum readsize")
24 the_parser.add_argument('--maxquery', type=int, help="Maximum readsize")
25 the_parser.add_argument('--rcode', type=str, help="R script")
26 the_parser.add_argument('--global_size', action="store_true", help="if specified, size distribution is calcilated for the sum of all items")
27 the_parser.add_argument('--collapse', action="store_true", help="if specified, forward and reverse reads are collapsed")
28 args = the_parser.parse_args()
29 return args
30
31 args=Parser()
32 if args.reference_fasta:
33 genomeRefFormat = "fastaSource"
34 genomeRefFile = args.reference_fasta
35 if args.reference_bowtie_index:
36 genomeRefFormat = "bowtieIndex"
37 genomeRefFile = args.reference_bowtie_index
38 size_distribution_file=args.output_size_distribution
39 minquery=args.minquery
40 maxquery=args.maxquery
41 Rcode = args.rcode
42 filePath=args.input
43 fileExt=args.ext
44 fileLabel=args.label
45 normalization_factor=args.normalization_factor
46 global_size=args.global_size
47 collapse=args.collapse
48
49 if collapse:
50 pol=["both"]
51 else:
52 pol=["F", "R"]
53
54 MasterListOfGenomes = OrderedDict()
55
56 def process_samples(filePath):
57 for i, filePath in enumerate(filePath):
58 norm=normalization_factor[i]
59 print fileLabel[i]
60 MasterListOfGenomes[fileLabel[i]] = HandleSmRNAwindows (alignmentFile=filePath, alignmentFileFormat=fileExt[i], genomeRefFile=genomeRefFile, genomeRefFormat=genomeRefFormat,\
61 biosample=fileLabel[i], size_inf=minquery, size_sup=maxquery, norm=norm)
62 return MasterListOfGenomes
63
64 def write_size_distribution_dataframe(readDict, size_distribution_file, pol=["both"] ):
65 '''refactored on 7-9-2014'''
66 with open(size_distribution_file, 'w') as size_distrib:
67 print >>size_distrib, "gene\tpolarity\tsize\tcount\tsample"
68 for sample in readDict.keys():
69 if args.gff:
70 dict=readDict[sample]
71 else:
72 dict=readDict[sample].instanceDict
73 for gene in dict.keys():
74 histogram = dict[gene].size_histogram()
75 for polarity in pol:
76 for size, count in histogram[polarity].iteritems():
77 print >>size_distrib, "%s\t%s\t%s\t%s\t%s" % (gene, polarity, size, count, sample)
78
79 def write_size_distribution_dataframe_global(readDict, size_distribution_file, pol=["both"]):
80 with open(size_distribution_file, 'w') as size_distrib:
81 print >>size_distrib, "gene\tpolarity\tsize\tcount\tsample"
82 for sample in readDict.keys():
83 histogram = readDict[sample].size_histogram()
84 gene="sample"
85 for polarity in pol:
86 for size, count in histogram[polarity].iteritems():
87 print >>size_distrib, "%s\t%s\t%s\t%s\t%s" % (gene, polarity, size, count, sample)
88
89 def gff_item_subinstances(readDict, gff3):
90 GFFinstanceDict=OrderedDict()
91 with open(gff3) as gff:
92 for line in gff:
93 if line[0] == "#": continue
94 gff_fields = line[:-1].split("\t")
95 chrom = gff_fields[0]
96 gff_name = gff_fields[-1].split("Name=")[-1].split(";")[0] # to isolate the GFF Name
97 item_upstream_coordinate = int(gff_fields[3])
98 item_downstream_coordinate = int(gff_fields[4])
99 item_polarity = gff_fields[6]
100 for sample in readDict.keys():
101 if not GFFinstanceDict.has_key(sample):
102 GFFinstanceDict[sample]={}
103 subinstance=extractsubinstance(item_upstream_coordinate, item_downstream_coordinate, readDict[sample].instanceDict[chrom])
104 if item_polarity == '-':
105 subinstance.readDict={key*-1:value for key, value in subinstance.readDict.iteritems()}
106 # subinstance.readDict.setdefault(key, [])
107 subinstance.gene=gff_name
108 GFFinstanceDict[sample][gff_name]=subinstance
109 return GFFinstanceDict
110
111 MasterListOfGenomes=process_samples(filePath)
112
113 if args.gff:
114 MasterListOfGenomes=gff_item_subinstances(MasterListOfGenomes, args.gff)
115
116 if global_size:
117 write_size_distribution_dataframe_global(MasterListOfGenomes, size_distribution_file, pol)
118 else:
119 write_size_distribution_dataframe(MasterListOfGenomes, size_distribution_file, pol)
120
121 R_command="Rscript "+ Rcode
122 process = subprocess.Popen(R_command.split())
123 process.wait()
124
125