diff cwpair2_util.py @ 6:c4b926c9831c draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/cwpair2 commit ca66053a0d915edba3d7f1d4ce014718e7f4f0f2
author iuc
date Thu, 02 May 2019 08:30:22 -0400
parents 71188f3f4b76
children
line wrap: on
line diff
--- a/cwpair2_util.py	Mon Nov 06 23:19:50 2017 -0500
+++ b/cwpair2_util.py	Thu May 02 08:30:22 2019 -0400
@@ -15,7 +15,7 @@
 # Data output formats
 GFF_EXT = 'gff'
 TABULAR_EXT = 'tabular'
-# Statistics historgrams output directory.
+# Statistics histograms output directory.
 HISTOGRAM = 'H'
 # Statistics outputs
 FINAL_PLOTS = 'F'
@@ -46,7 +46,7 @@
 
     def get_bin(self, x):
         """
-        Returns the bin in which a data point falls
+        Returns the centre of the bin in which a data point falls
         """
         return self.start + (x - self.start) // self.binsize * self.binsize + self.binsize / 2.0
 
@@ -64,7 +64,12 @@
         return x, y
 
     def mode(self):
-        return max(self.dist.items(), key=lambda data: data[1])[0]
+        # There could be more than one mode for a frequency distribution,
+        # return the median of the modes to be consistent
+        max_frequency = max(self.dist.values())
+        modes = sorted(_[0] for _ in self.dist.items() if _[1] == max_frequency)
+        median_index = len(modes) // 2
+        return modes[median_index]
 
     def size(self):
         return sum(self.dist.values())
@@ -76,7 +81,7 @@
 
 
 def distance(peak1, peak2):
-    return (peak2[1] + peak2[2]) / 2 - (peak1[1] + peak1[2]) / 2
+    return (peak2[1] + peak2[2]) / 2.0 - (peak1[1] + peak1[2]) / 2.0
 
 
 def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}):
@@ -92,9 +97,11 @@
 def parse_chromosomes(reader):
     # This version of cwpair2 accepts only gff format as input.
     chromosomes = {}
-    next(reader)
     for line in reader:
-        cname, junk, junk, start, end, value, strand, junk, junk = line
+        line = line.rstrip("\r\n")
+        if not line or line.startswith('#'):
+            continue
+        cname, _, _, start, end, value, strand, _, _ = line.split("\t")
         start = int(start)
         end = int(end)
         value = float(value)
@@ -118,7 +125,7 @@
     return values[int(len(values) * 0.95)]
 
 
-def filter(chromosomes, threshold=0.05):
+def peak_filter(chromosomes, threshold):
     """
     Filters the peaks to those above a threshold. Threshold < 1.0 is interpreted
     as a proportion of the maximum, >=1.0 as an absolute value.
@@ -139,7 +146,7 @@
 
 def all_pair_distribution(chromosomes, up_distance, down_distance, binsize):
     dist = FrequencyDistribution(-up_distance, down_distance, binsize=binsize)
-    for cname, data in chromosomes.items():
+    for data in chromosomes.values():
         watson, crick = split_strands(data)
         crick.sort(key=lambda data: float(data[1]))
         keys = make_keys(crick)
@@ -256,7 +263,6 @@
     output_orphans = output_files in ["all", "matched_pair_orphan", "matched_pair_orphan_detail"]
     # Keep track of statistics for the output file
     statistics = {}
-    input = csv.reader(open(dataset_path, 'rt'), delimiter='\t')
     fpath, fname = os.path.split(dataset_path)
     statistics['fname'] = '%s: data %s' % (method, str(galaxy_hid))
     statistics['dir'] = fpath
@@ -276,12 +282,13 @@
     def td_writer(output_type, extension, fname):
         # Returns a tab-delimited writer for a specified output.
         output_file_path = make_path(output_type, extension, fname)
-        return csv.writer(open(output_file_path, 'wt'), delimiter='\t')
+        return csv.writer(open(output_file_path, 'wt'), delimiter='\t', lineterminator="\n")
 
-    try:
-        chromosomes = parse_chromosomes(input)
-    except Exception:
-        stop_err('Unable to parse file "%s".\n%s' % (dataset_path, traceback.format_exc()))
+    with open(dataset_path, 'rt') as input:
+        try:
+            chromosomes = parse_chromosomes(input)
+        except Exception:
+            stop_err('Unable to parse file "%s".\n%s' % (dataset_path, traceback.format_exc()))
     if output_details:
         # Details
         detailed_output = td_writer('data_%s' % DETAILS, TABULAR_EXT, fname)
@@ -303,8 +310,8 @@
         statistics['graph_path'] = make_histogram_path(STATS_GRAPH, fname)
     statistics['perc95'] = perc95(chromosomes)
     if threshold > 0:
-        # Apply filter
-        filter(chromosomes, threshold)
+        # Apply peak_filter
+        peak_filter(chromosomes, threshold)
     if method == 'mode':
         freq = all_pair_distribution(chromosomes, up_distance, down_distance, binsize)
         mode = freq.mode()