comparison join_files_on_column_fuzzy.py @ 0:64469e7ecf9f draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/join_files_on_column_fuzzy commit fd2bc86108994c9eda731b305ca6a8c71554cfaa
author bgruening
date Sun, 26 Nov 2017 16:13:51 -0500
parents
children f2068690addc
comparison
equal deleted inserted replaced
-1:000000000000 0:64469e7ecf9f
1 #!/usr/bin/env python
2
3 import os
4 import argparse
5 import sys
6
def main(args):
    """Fuzzy-join two delimited files on a numeric column.

    Every data row of ``args.f1`` is compared with rows of ``args.f2``; a pair
    is written to ``args.outfile`` when the two join values are no further
    apart than ``args.distance``.  File 2 is streamed through a small sliding
    window, so the single pass only finds all matches if both files are sorted
    ascending on their join column.

    Attributes read from ``args``:
        f1, f2: paths of the two input files.
        c1, c2: 1-based column numbers holding the numeric join value.
        outfile: path of the result file (overwritten).
        header: if true, the first line of each file is a header; both headers
            are joined and written first.
        closest: if true, write only the closest match per row of file 1.
        add_distance: if true, append the computed distance as an extra column
            (the header gets the unit name as the column title).
        sep: column separator of the input files (output is tab-joined).
        distance: maximal allowed distance.
        unit: ``'absolute'`` (plain difference) or ``'ppm'`` (difference
            relative to the file-1 value, in parts per million).

    Raises:
        ValueError: if ``args.unit`` is unknown or a join value is not numeric.
        IndexError: if a row has fewer columns than ``c1`` / ``c2``.
    """
    from collections import deque

    if args.unit not in ('absolute', 'ppm'):
        raise ValueError("unit must be 'absolute' or 'ppm', got %r" % (args.unit,))
    absolute = args.unit == 'absolute'

    def _readline(header=False):
        """Yield the header line of file 2 (if requested) as a string, then
        one ``(columns, value)`` pair per non-empty data line."""
        with open(args.f2) as handle2:
            for line in handle2:
                line = line.strip()
                if header:
                    header = False
                    yield line
                    continue
                if not line:
                    continue
                columns = line.split(args.sep)
                yield columns, float(columns[args.c2 - 1])

    it = _readline(header=args.header)
    # Window of file-2 records that may still match the current or a later
    # file-1 row.  A deque gives O(1) pops from the left.
    cache = deque()

    def fill_cache():
        """Move the next file-2 record into the window, if one is left."""
        record = next(it, None)
        if record is not None:
            cache.append(record)

    def _ppm_distance(value1, value2):
        """Return the distance in ppm relative to ``value1``."""
        if value1 == 0:
            # A relative distance to zero is undefined; only an exact hit matches.
            return 0.0 if value2 == 0 else float('inf')
        return abs((value1 - value2) / value1 * 1000000)

    with open(args.f1) as handle1, open(args.outfile, 'w') as out:
        expect_header = args.header
        for line in handle1:
            line = line.strip()
            if expect_header:
                expect_header = False
                # An empty file 2 has no header; join with an empty string
                # instead of failing with a bare StopIteration.
                second_header = next(it, '')
                if args.add_distance:
                    out.write('%s\t%s\t%s\n' % (line, second_header, args.unit))
                else:
                    out.write('%s\t%s\n' % (line, second_header))
                continue
            if not line:
                continue

            columns = line.split(args.sep)
            value1 = float(columns[args.c1 - 1])
            lower_bound = value1 - args.distance
            upper_bound = value1 + args.distance

            matches = []  # (distance, output line) for the current file-1 row
            kept = []     # records that stay in the window for the next row
            fill_cache()
            while cache:
                record = cache.popleft()
                columns2, value2 = record
                if absolute:
                    dist = abs(value1 - value2)
                    matched = lower_bound <= value2 <= upper_bound
                    ahead = value2 > upper_bound
                else:
                    dist = _ppm_distance(value1, value2)
                    matched = dist <= args.distance
                    ahead = value2 > value1

                if matched:
                    joined = '\t'.join(columns + columns2)
                    if args.add_distance:
                        joined += '\t' + str(dist)
                    matches.append((dist, joined + '\n'))
                    # A matching record may also match the next file-1 row.
                    kept.append(record)
                    fill_cache()
                elif ahead:
                    # The file-2 value is beyond the current window: keep it
                    # for the following file-1 rows and do not read further.
                    kept.append(record)
                else:
                    # The file-2 value is below the current window and can never
                    # match a later (larger) file-1 value: drop it and look at
                    # the next record of file 2.
                    fill_cache()
            cache.extend(kept)

            if args.closest and matches:
                # min() returns the first of equally close matches, i.e. the
                # same row a stable sort by distance would put first.
                out.write(min(matches, key=lambda match: match[0])[1])
            else:
                for _dist, text in matches:
                    out.write(text)
99
100
if __name__ == '__main__':

    # Command-line entry point: parse the options, validate them and run the join.
    parser = argparse.ArgumentParser(description='Merge two files on a common column the fuzzy way.')
    parser.add_argument('--f1', required=True)
    parser.add_argument('--f2', required=True)
    parser.add_argument('--c1', type=int, required=True, help="Column in file 1 to be merged on.")
    parser.add_argument('--c2', type=int, required=True, help="Column in file 2 to be merged on.")
    parser.add_argument('--outfile', required=True)
    parser.add_argument('--header', action='store_true', help="The files have a header line at the beginning.")
    parser.add_argument('--closest', action='store_true', help="Only report the closest match.")
    parser.add_argument('--add_distance', action='store_true', help="Add additional column with the distance between the two values.")
    parser.add_argument('--sep', type=str, default="\t", help="Files are separated by this separator.")
    parser.add_argument('--distance', type=float, default=0.2, help="Maximal allowed distance.")
    parser.add_argument('--unit', choices=['ppm', 'absolute'], default='absolute')
    args = parser.parse_args()

    # Column numbers are 1-based and used as ``columns[c - 1]``; zero or a
    # negative number would silently index from the end of the row.
    for option in ('c1', 'c2'):
        if getattr(args, option) < 1:
            parser.error('--%s must be a 1-based column number (>= 1).' % option)

    main(args)
118
119