Mercurial > repos > bgruening > join_files_on_column_fuzzy
comparison join_files_on_column_fuzzy.py @ 0:64469e7ecf9f draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/join_files_on_column_fuzzy commit fd2bc86108994c9eda731b305ca6a8c71554cfaa
author | bgruening |
---|---|
date | Sun, 26 Nov 2017 16:13:51 -0500 |
parents | |
children | f2068690addc |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:64469e7ecf9f |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import os | |
4 import argparse | |
5 import sys | |
6 | |
def _parse_rows(handle, sep, column):
    """Yield ``(columns, key)`` for every non-blank line of *handle*.

    ``columns`` is the whitespace-stripped line split on *sep*; ``key`` is the
    value found in the 1-based *column*, converted to ``float``.

    Raises:
        IndexError: if a row has fewer than *column* fields.
        ValueError: if the join field is not a valid float.
    """
    for raw in handle:
        stripped = raw.strip()
        if not stripped:
            continue
        columns = stripped.split(sep)
        yield columns, float(columns[column - 1])


def _match_distance(value1, value2, tolerance, unit):
    """Return the distance between the values, or None when out of tolerance.

    For ``unit == 'absolute'`` the distance is ``abs(value1 - value2)`` and the
    values match when *value2* lies in ``[value1 - tolerance, value1 + tolerance]``.
    For ``unit == 'ppm'`` the distance is the deviation of *value2* relative to
    *value1* in parts per million. Any other unit never matches.
    """
    if unit == 'absolute':
        if value1 - tolerance <= value2 <= value1 + tolerance:
            return abs(value1 - value2)
        return None
    if unit == 'ppm':
        if value1 == 0:
            # A relative deviation from zero is undefined; only an exact hit
            # counts as a match instead of raising ZeroDivisionError.
            return 0.0 if value2 == 0 else None
        ppm = abs((value1 - value2) / value1 * 1000000)
        return ppm if ppm <= tolerance else None
    return None


def main(args):
    """Join two delimited files on numeric columns that are approximately equal.

    File 2 is streamed through a sliding window while file 1 is read row by
    row. Rows of file 2 that fall below the current window are dropped and
    never revisited, so both files must be sorted ascending on their join
    column for all matches to be found.

    Args:
        args: namespace providing
            f1, f2 -- paths of the two input files;
            c1, c2 -- 1-based join column in file 1 / file 2;
            outfile -- path of the file to write;
            header -- True if both files start with a header line;
            closest -- True to report only the closest match per file-1 row;
            add_distance -- True to append the distance as an extra column;
            sep -- column separator used to split input rows;
            distance -- maximal allowed distance;
            unit -- 'absolute' or 'ppm'.

    Output rows are the columns of the file-1 row followed by the columns of
    the matching file-2 row, joined with tabs.
    """
    from collections import deque

    with open(args.outfile, 'w+') as out, \
            open(args.f1) as handle1, \
            open(args.f2) as handle2:

        if args.header:
            header1 = next(handle1, None)
            if header1 is None:
                # File 1 is completely empty: nothing to join, nothing to write.
                return
            # An empty file 2 contributes an empty header instead of crashing.
            header2 = next(handle2, '')
            fields = [header1.strip(), header2.strip()]
            if args.add_distance:
                fields.append(args.unit)
            out.write('\t'.join(fields) + '\n')

        records = _parse_rows(handle2, args.sep, args.c2)
        # File-2 rows still to be compared against the current file-1 row.
        pending = deque()

        def pull():
            """Move the next file-2 row, if any is left, into the window."""
            record = next(records, None)
            if record is not None:
                pending.append(record)

        for columns1, value1 in _parse_rows(handle1, args.sep, args.c1):
            carried = []   # file-2 rows that stay relevant for the next row
            matches = []   # (distance, formatted output line)
            pull()
            while pending:
                record = pending.popleft()
                columns2, value2 = record
                dist = _match_distance(value1, value2, args.distance, args.unit)
                if dist is not None:
                    text = '\t'.join(columns1 + columns2)
                    if args.add_distance:
                        text += '\t' + str(dist)
                    matches.append((dist, text + '\n'))
                    # A match may also match the next file-1 row: keep it
                    # and look at the following file-2 row as well.
                    carried.append(record)
                    pull()
                elif value2 > value1:
                    # Above the window: keep it for the next file-1 row.
                    carried.append(record)
                else:
                    # Below the window: drop it and look at the next file-2 row.
                    pull()

            if args.closest and matches:
                # min() returns the first minimal entry, i.e. ties are
                # resolved in file-2 order.
                out.write(min(matches, key=lambda match: match[0])[1])
            else:
                for _dist, text in matches:
                    out.write(text)
            pending.extend(carried)
99 | |
100 | |
def build_parser():
    """Return the command-line parser for the fuzzy join script.

    The parsed namespace carries exactly the attributes ``main`` reads:
    f1, f2, c1, c2, outfile, header, closest, add_distance, sep, distance
    and unit.
    """
    parser = argparse.ArgumentParser(description='Merge two files on a common column the fuzzy way.')
    parser.add_argument('--f1', required=True, help="First input file.")
    parser.add_argument('--f2', required=True, help="Second input file.")
    parser.add_argument('--c1', type=int, required=True, help="Column in file 1 to be merged on.")
    parser.add_argument('--c2', type=int, required=True, help="Column in file 2 to be merged on.")
    parser.add_argument('--outfile', required=True, help="File the merged rows are written to.")
    parser.add_argument('--header', action='store_true', help="The files have a header line at the beginning.")
    parser.add_argument('--closest', action='store_true', help="Only report the closest match.")
    parser.add_argument('--add_distance', action='store_true', help="Add additional column with the distance between the two values.")
    parser.add_argument('--sep', type=str, default="\t", help="Files are separated by this separator.")
    # The default is given as a float, matching the declared type.
    parser.add_argument('--distance', type=float, default=0.2, help="Maximal allowed distance.")
    parser.add_argument('--unit', choices=['ppm', 'absolute'], default='absolute', help="Unit of the maximal allowed distance.")
    return parser


if __name__ == '__main__':

    main(build_parser().parse_args())
118 | |
119 |