Repository 'query_tabular'
hg clone https://toolshed.g2.bx.psu.edu/repos/jjohnson/query_tabular

Changeset 11:fd16243931d6 (2017-05-12)
Previous changeset 10:e84d1c3bf4fe (2017-03-02) Next changeset 12:a6aec10e5dbf (2017-05-12)
Commit message:
Uploaded
modified:
query_tabular.py
query_tabular.xml
added:
._query_tabular.py
._query_tabular.xml
test-data/pets.tsv
b
diff -r e84d1c3bf4fe -r fd16243931d6 ._query_tabular.py
b
Binary file ._query_tabular.py has changed
b
diff -r e84d1c3bf4fe -r fd16243931d6 ._query_tabular.xml
b
Binary file ._query_tabular.xml has changed
b
diff -r e84d1c3bf4fe -r fd16243931d6 query_tabular.py
--- a/query_tabular.py Thu Mar 02 13:43:25 2017 -0500
+++ b/query_tabular.py Fri May 12 10:18:42 2017 -0400
[
@@ -64,31 +64,71 @@
         self.source = source
         self.filter_dict = filter_dict
         # print >> sys.stderr, 'LineFilter %s' % filter_dict if filter_dict else 'NONE'
-        self.func = lambda l: l.rstrip('\r\n') if l else None
+        self.func = lambda i,l: l.rstrip('\r\n') if l else None
+        self.src_lines = []
+        self.src_line_cnt = 0
         if not filter_dict:
             return
         if filter_dict['filter'] == 'regex':
             rgx = re.compile(filter_dict['pattern'])
             if filter_dict['action'] == 'exclude_match':
-                self.func = lambda l: l if not rgx.match(l) else None
+                self.func = lambda i,l: l if not rgx.match(l) else None
             elif filter_dict['action'] == 'include_match':
-                self.func = lambda l: l if rgx.match(l) else None
+                self.func = lambda i,l: l if rgx.match(l) else None
             elif filter_dict['action'] == 'exclude_find':
-                self.func = lambda l: l if not rgx.search(l) else None
+                self.func = lambda i,l: l if not rgx.search(l) else None
             elif filter_dict['action'] == 'include_find':
-                self.func = lambda l: l if rgx.search(l) else None
+                self.func = lambda i,l: l if rgx.search(l) else None
         elif filter_dict['filter'] == 'replace':
             p = filter_dict['pattern']
             r = filter_dict['replace']
             c = int(filter_dict['column']) - 1
-            self.func = lambda l: '\t'.join([x if i != c else re.sub(p,r,x) for i,x in enumerate(l.split('\t'))])
+            self.func = lambda i,l: '\t'.join([x if i != c else re.sub(p,r,x) for i,x in enumerate(l.split('\t'))])
+        elif filter_dict['filter'] == 'prepend_line_num':
+            self.func = lambda i,l: '%d\t%s' % (i,l) 
+        elif filter_dict['filter'] == 'append_line_num':
+            self.func = lambda i,l: '%s\t%d' % (l.rstrip('\r\n'),i) 
+        elif filter_dict['filter'] == 'skip':
+            cnt = filter_dict['count']
+            self.func = lambda i,l: l if i > cnt else None
+        elif filter_dict['filter'] == 'normalize':
+            cols = [int(c) - 1 for c in filter_dict['columns']]
+            sep = filter_dict['separator']
+            self.func = lambda i,l: self.normalize(l,cols,sep)
     def __iter__(self):
         return self
-    def next(self):
+    def normalize(self,line,split_cols,sep):
+        lines = []
+        fields = line.rstrip('\r\n').split('\t')
+        split_fields = dict()
+        cnt = 0
+        for c in split_cols:
+            if c < len(fields):
+                split_fields[c] = fields[c].split(sep)
+                cnt = max(cnt, len(split_fields[c]))
+        if cnt == 0:
+            lines.append('\t'.join(fields))
+        else:
+            for n in range(0, cnt):
+                flds = [x if c not in split_cols else split_fields[c][n] if n < len(split_fields[c]) else ''  for (c, x) in enumerate(fields)]
+                lines.append('\t'.join(flds))
+        return lines
+    def get_lines(self):
         for i,next_line in enumerate(self.source):
-            line = self.func(next_line)
+            self.src_line_cnt += 1
+            line = self.func(self.src_line_cnt,next_line)
+            # print >> sys.stderr, 'LineFilter %s: %d %s' % (str(self.filter_dict),self.src_line_cnt,line)
             if line:
-                return line
+               if isinstance(line,list):
+                   self.src_lines.extend(line)
+               else:
+                   self.src_lines.append(line)
+               return
+    def next(self):
+        if not self.src_lines:
+            self.get_lines()
+        if self.src_lines:
+                return self.src_lines.pop(0)
         raise StopIteration
 
 
b
diff -r e84d1c3bf4fe -r fd16243931d6 query_tabular.xml
--- a/query_tabular.xml Thu Mar 02 13:43:25 2017 -0500
+++ b/query_tabular.xml Fri May 12 10:18:42 2017 -0400
[
b'@@ -1,4 +1,4 @@\n-<tool id="query_tabular" name="Query Tabular" version="2.0.0">\n+<tool id="query_tabular" name="Query Tabular" version="3.0.0">\n     <description>using sqlite sql</description>\n \n     <requirements>\n@@ -79,13 +79,24 @@\n   #set $input_filters = []\n   #for $fi in $tbl.input_opts.linefilters:\n     #if $fi.filter.filter_type == \'skip\':\n+      #set $skip_lines = None\n       #if str($fi.filter.skip_lines) != \'\':\n-        #set $jtbl[\'comment_lines\'] = int($fi.filter.skip_lines)\n+        #set $skip_lines = int($fi.filter.skip_lines)\n       #elif $tbl.table.metadata.comment_lines and $tbl.table.metadata.comment_lines > 0:\n-        #set $jtbl[\'comment_lines\'] = int($tbl.table.metadata.comment_lines)\n+        #set $skip_lines = int($tbl.table.metadata.comment_lines)\n+      #end if\n+      #if $skip_lines is not None:\n+        #set $filter_dict = dict()\n+        #set $filter_dict[\'filter\'] = str($fi.filter.filter_type)\n+        #set $filter_dict[\'count\'] = $skip_lines\n+        #silent $input_filters.append($filter_dict)\n       #end if\n     #elif $fi.filter.filter_type == \'comment\':\n-      #set $jtbl[\'comment_char\'] = str($fi.filter.comment_char)\n+      #set $filter_dict = dict()\n+      #set $filter_dict[\'filter\'] = \'regex\'\n+      #set $filter_dict[\'pattern\'] = \'^\' + str($fi.filter.comment_char)\n+      #set $filter_dict[\'action\'] = \'exclude\'\n+      #silent $input_filters.append($filter_dict)\n     #elif $fi.filter.filter_type == \'regex\':\n       #set $filter_dict = dict()\n       #set $filter_dict[\'filter\'] = str($fi.filter.filter_type)\n@@ -99,12 +110,16 @@\n       #set $filter_dict[\'pattern\'] = str($fi.filter.regex_pattern)\n       #set $filter_dict[\'replace\'] = str($fi.filter.regex_replace)\n       #silent $input_filters.append($filter_dict)\n-    ## #elif $fi.filter.filter_type == \'normalize\':\n-    ##   #set $filter_dict = dict()\n-    ##   #set $filter_dict[\'filter\'] = str($fi.filter.filter_type)\n-    ##   #set $filter_dict[\'columns\'] = [int(str($ci)) for $ci in str($fi.filter.columns).split(\',\')]\n-    ##   #set $filter_dict[\'separator\'] = str($fi.filter.separator)\n-    ##   #silent $input_filters.append($filter_dict)\n+    #elif str($fi.filter.filter_type).endswith(\'pend_line_num\'):\n+      #set $filter_dict = dict()\n+      #set $filter_dict[\'filter\'] = str($fi.filter.filter_type)\n+      #silent $input_filters.append($filter_dict)\n+    #elif $fi.filter.filter_type == \'normalize\':\n+      #set $filter_dict = dict()\n+      #set $filter_dict[\'filter\'] = str($fi.filter.filter_type)\n+      #set $filter_dict[\'columns\'] = [int(str($ci)) for $ci in str($fi.filter.columns).split(\',\')]\n+      #set $filter_dict[\'separator\'] = str($fi.filter.separator)\n+      #silent $input_filters.append($filter_dict)\n     #end if\n   #end for\n   #if $input_filters:\n@@ -131,9 +146,9 @@\n                             <option value="comment">comment char</option>\n                             <option value="regex">by regex expression matching</option>\n                             <option value="replace">regex replace value in column</option>\n-                            <!--\n+                            <option value="prepend_line_num">prepend a line number column</option>\n+                            <option value="append_line_num">append a line number column</option>\n                             <option value="normalize">normalize list columns, replicates row for each item in list</option>\n-                            -->\n                         </param>\n                         <when value="skip">\n                              <param name="skip_lines" type="integer" value="" min="0" optional="true" label="Skip lines" \n@@ -144,6 +159,8 @@\n                                 <sanitizer sanitize="False"/>\n                             </param>\n                         </when>\n+                        <when value="prepend_line_num"/>\n+                        <when value="append_line_num"/>\n                         <when value="regex">\n                             <pa'..b"rstName           LastName   DOB       PetNames  PetType\n+    2    Paula               Brown      24/05/78  Rex,Fluff dog,cat\n+    1    Steven              Jones      04/04/74  Allie     cat\n+    0    Jane                Doe        24/05/78  \n+    1    James               Smith      20/10/80  Spot\n+\n+\n+    Filter 1 - append a line number column:\n+\n+    #People with pets                                                 1\n+    Pets FirstName           LastName   DOB       PetNames  PetType   2\n+    2    Paula               Brown      24/05/78  Rex,Fluff dog,cat   3\n+    1    Steven              Jones      04/04/74  Allie     cat       4\n+    0    Jane                Doe        24/05/78                      5\n+    1    James               Smith      20/10/80  Spot                6\n+\n+    Filter 2 - by regex expression matching [include]: '^\\d+' (include lines that start with a number) \n+\n+    2    Paula               Brown      24/05/78  Rex,Fluff dog,cat   3\n+    1    Steven              Jones      04/04/74  Allie     cat       4\n+    0    Jane                Doe        24/05/78                      5\n+    1    James               Smith      20/10/80  Spot                6\n+\n+    Filter 3 - append a line number column:\n+\n+    2    Paula               Brown      24/05/78  Rex,Fluff dog,cat   3  1\n+    1    Steven              Jones      04/04/74  Allie     cat       4  2\n+    0    Jane                Doe        24/05/78                      5  3\n+    1    James               Smith      20/10/80  Spot                6  4\n+\n+    Filter 4 - regex replace value in column[4]: '(\\d+)/(\\d+)/(\\d+)' '19\\3-\\2-\\1' (convert dates to sqlite format) \n+\n+    2    Paula               Brown      1978-05-24  Rex,Fluff dog,cat   3  1\n+    1    Steven              Jones      1974-04-04  Allie     cat       4  2\n+    0    Jane                Doe        1978-05-24                      5  3\n+    1    James               Smith      1980-10-20  Spot                6  4\n+\n+    Filter 5 - normalize list columns[5,6]:\n+\n+    2    Paula               Brown      1978-05-24  Rex       dog       3  1\n+    2    Paula               Brown      1978-05-24  Fluff     cat       3  1\n+    1    Steven              Jones      1974-04-04  Allie     cat       4  2\n+    0    Jane                Doe        1978-05-24                      5  3\n+    1    James               Smith      1980-10-20  Spot                6  4\n+\n+    Filter 6 - append a line number column:\n+\n+    2    Paula               Brown      1978-05-24  Rex       dog       3  1  1\n+    2    Paula               Brown      1978-05-24  Fluff     cat       3  1  2\n+    1    Steven              Jones      1974-04-04  Allie     cat       4  2  3\n+    0    Jane                Doe        1978-05-24                      5  3  4\n+    1    James               Smith      1980-10-20  Spot                6  4  5\n+\n+\n+   Table name: pets\n+\n+   Table columns: Pets,FirstName,LastName,Birthdate,PetNames,PetType,line_num,entry_num,row_num\n+\n+   Query: SELECT * FROM pets \n+\n+   Result:\n+\n+     =====  =========  ========  ==========  ========  =======  ========  =========  ======= \n+     #Pets  FirstName  LastName  Brithdate   PetNames  PetType  line_num  entry_num  row_num\n+     =====  =========  ========  ==========  ========  =======  ========  =========  ======= \n+     2      Paula      Brown     1978-05-24  Rex       dog      3         1          1\n+     2      Paula      Brown     1978-05-24  Fluff     cat      3         1          2\n+     1      Steven     Jones     1974-04-04  Allie     cat      4         2          3\n+     0      Jane       Doe       1978-05-24                     5         3          4\n+     1      James      Smith     1980-10-20  Spot               6         4          5          \n+     =====  =========  ========  ==========  ========  =======  ========  =========  ======= \n+\n+\n .. _Regular_expression: https://docs.python.org/release/2.7/library/re.html\n .. _SQLite: http://www.sqlite.org/index.html\n \n"
b
diff -r e84d1c3bf4fe -r fd16243931d6 test-data/pets.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/pets.tsv Fri May 12 10:18:42 2017 -0400
b
@@ -0,0 +1,7 @@
+#People with pets
+Pets FirstName LastName DOB PetNames PetType
+2 Paula Brown 24/05/78 Rex,Fluff dog,cat
+1 Steven Jones 04/04/74 Allie cat
+0 Jane Doe 24/05/78
+1 James Smith 20/10/80 Spot
+