changeset 9:69b08fc9557c draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/query_tabular commit daa9af57fe07ee83a45ddc9f855716f9d14a8e12"
author iuc
date Sat, 12 Sep 2020 01:21:45 +0000
parents 59278960a2e7
children 9cc411a62277
files filter_tabular.xml filters.py load_db.py macros.xml query_db.py test-data/filtered_customers_results.tsv
diffstat 6 files changed, 88 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- a/filter_tabular.xml	Thu Jan 23 07:36:21 2020 -0500
+++ b/filter_tabular.xml	Sat Sep 12 01:21:45 2020 +0000
@@ -1,4 +1,4 @@
-<tool id="filter_tabular" name="Filter Tabular" version="2.0.1">
+<tool id="filter_tabular" name="Filter Tabular" version="3.1.0">
     <description></description>
 
     <macros>
@@ -101,6 +101,35 @@
             <output name="output" file="filtered_pets_results.tsv"/>
         </test>
 
+        <test>
+            <param name="input" ftype="tabular" value="customers.tsv"/>
+            <repeat name="linefilters">
+                <conditional name="filter">
+                    <param name="filter_type" value="regex"/>
+                    <param name="regex_pattern" value="^\d+"/>
+                    <param name="regex_action" value="include_find"/>
+                </conditional>
+            </repeat>
+            <repeat name="linefilters">
+                <conditional name="filter">
+                    <param name="filter_type" value="replace"/>
+                    <param name="column" value="c4"/>
+                    <param name="regex_pattern" value=".*@(.*)"/>
+                    <param name="regex_replace" value="\1"/>
+                    <param name="add" value="after"/>
+                </conditional>
+            </repeat>
+            <repeat name="linefilters">
+                <conditional name="filter">
+                    <param name="filter_type" value="replace"/>
+                    <param name="column" value="c4"/>
+                    <param name="regex_pattern" value=".*(com|org|net)$"/>
+                    <param name="regex_replace" value="\1"/>
+                    <param name="add" value="append"/>
+                </conditional>
+            </repeat>
+            <output name="output" file="filtered_customers_results.tsv"/>
+        </test>
     </tests>
     <help><![CDATA[
 ==============
--- a/filters.py	Thu Jan 23 07:36:21 2020 -0500
+++ b/filters.py	Sat Sep 12 01:21:45 2020 +0000
@@ -32,9 +32,20 @@
             p = filter_dict['pattern']
             r = filter_dict['replace']
             c = int(filter_dict['column']) - 1
-            self.func = lambda i, l: '\t'.join(
-                [x if j != c else re.sub(p, r, x)
-                 for j, x in enumerate(l.split('\t'))])
+            if 'add' not in filter_dict\
+                or filter_dict['add'] not in ['prepend',
+                                              'append',
+                                              'before',
+                                              'after']:
+                self.func = lambda i, l: '\t'.join(
+                    [x if j != c else re.sub(p, r, x)
+                     for j, x in enumerate(l.split('\t'))])
+            else:
+                a = 0 if filter_dict['add'] == 'prepend'\
+                    else min(0, c - 1) if filter_dict['add'] == 'before'\
+                    else c + 1 if filter_dict['add'] == 'after'\
+                    else None
+                self.func = lambda i, l: self.replace_add(l, p, r, c, a)
         elif filter_dict['filter'] == 'prepend_line_num':
             self.func = lambda i, l: '%d\t%s' % (i, l)
         elif filter_dict['filter'] == 'append_line_num':
@@ -69,6 +80,14 @@
         fields = line.split('\t')
         return '\t'.join([fields[x] for x in cols])
 
+    def replace_add(self, line, pat, rep, col, pos):
+        """Add re.sub(pat, rep, fields[col]) as a new field at index pos."""
+        fields = line.rstrip('\r\n').split('\t')
+        i = len(fields) if pos is None else pos  # None: append; 0: prepend
+        hit = col < len(fields) and re.search(pat, fields[col])  # else ''
+        val = re.sub(pat, rep, fields[col]).replace('\t', ' ') if hit else ''
+        return '\t'.join(fields[:i] + [val] + fields[i:])
+
     def normalize(self, line, split_cols, sep):
         lines = []
         fields = line.rstrip('\r\n').split('\t')
--- a/load_db.py	Thu Jan 23 07:36:21 2020 -0500
+++ b/load_db.py	Sat Sep 12 01:21:45 2020 +0000
@@ -176,7 +176,7 @@
 
 
 def get_column_def(file_path, table_name, skip=0, comment_char='#',
-                   column_names=None, max_lines=100, load_named_columns=False,
+                   column_names=None, max_lines=1000, load_named_columns=False,
                    firstlinenames=False, filters=None):
     col_pref = ['TEXT', 'REAL', 'INTEGER', None]
     col_types = []
@@ -272,8 +272,26 @@
                         if x else None for i, x in enumerate(fields)]
                 c.execute(insert_stmt, vals)
             except Exception as e:
-                print('Failed at line: %d err: %s' % (linenum, e),
+                print('Load %s Failed line: %d err: %s' % (file_path, linenum, e),
                       file=sys.stderr)
+                for i, val in enumerate(fields):
+                    try:
+                        col_func[i](val)
+                    except Exception:
+                        colType = getValueType(val)
+                        col_func[i] = float if colType == 'REAL' else int if colType == 'INTEGER' else str
+                        print('Changing %s from %s to %s' % (col_names[i], col_types[i], colType),
+                              file=sys.stderr)
+                        col_types[i] = colType
+                vals = [col_func[i](x)
+                        if x else None for i, x in enumerate(fields)]
+                print('%s  %s' % (insert_stmt, vals),
+                      file=sys.stderr)
+                try:
+                    c.execute(insert_stmt, vals)
+                except Exception as e:
+                    print('Insert %s line: %d Failed err: %s' % (file_path, linenum, e),
+                          file=sys.stderr)
         conn.commit()
         c.close()
         for i, index in enumerate(unique_indexes):
--- a/macros.xml	Thu Jan 23 07:36:21 2020 -0500
+++ b/macros.xml	Sat Sep 12 01:21:45 2020 +0000
@@ -40,6 +40,9 @@
       #set $filter_dict['column'] = int(str($fi.filter.column).replace('c',''))
       #set $filter_dict['pattern'] = str($fi.filter.regex_pattern)
       #set $filter_dict['replace'] = str($fi.filter.regex_replace)
+      #if $fi.filter.add:
+          #set $filter_dict['add'] = str($fi.filter.add)
+      #end if
       #silent $input_filters.append($filter_dict)
     #elif str($fi.filter.filter_type).endswith('pend_line_num'):
       #set $filter_dict = dict()
@@ -172,7 +175,7 @@
                         <when value="replace">
                             <param name="column" type="text" value="" label="enter column number to replace"
                                 help="example: 1 or c1 (selects the first column)">
-                                <validator type="regex" message="Column ordinal position separated by commas">^(c?[1-9]\d*)$</validator>
+                                <validator type="regex" message="Column ordinal position">^(c?[1-9]\d*)$</validator>
                             </param>
                             <param name="regex_pattern" type="text" value="" label="regex pattern">
                                 <sanitizer sanitize="False"/>
@@ -180,6 +183,12 @@
                             <param name="regex_replace" type="text" value="" label="replacement expression">
                                 <sanitizer sanitize="False"/>
                             </param>
+                            <param name="add" type="select" optional="true" label="Instead of replacing, Add as new column:">
+                                <option value="prepend">prepend to beginning of row</option>
+                                <option value="append">append to the end of row</option>
+                                <option value="before">insert before column field</option>
+                                <option value="after">insert after column field</option>
+                            </param>
                         </when>
                         <when value="normalize">
                             <param name="columns" type="text" value="" label="enter column numbers to normalize">
@@ -211,6 +220,7 @@
   - by regex expression matching    *include/exclude* lines the match the regex expression 
   - select columns                  choose to include only selected columns in the order specified 
   - regex replace value in column   replace a field in a column using a regex substitution (good for date reformatting)
+  - regex replace into new column   add a new column using a regex substitution of a column value
   - prepend a line number column    each line has the ordinal value of the line read by this filter as the first column
   - append a line number column     each line has the ordinal value of the line read by this filter as the last column
   - prepend a text column           each line has the text string as the first column
--- a/query_db.py	Thu Jan 23 07:36:21 2020 -0500
+++ b/query_db.py	Sat Sep 12 01:21:45 2020 +0000
@@ -52,7 +52,7 @@
             except Exception as exc:
                 print("Warning: %s" % exc, file=sys.stderr)
     except Exception as e:
-        exit('Error: %s' % (e))
+        exit('describe_tables Error: %s' % (e))
     exit(0)
 
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/filtered_customers_results.tsv	Sat Sep 12 01:21:45 2020 +0000
@@ -0,0 +1,4 @@
+1	John	Smith	John.Smith@yahoo.com	yahoo.com	1968-02-04	626 222-2222	com
+2	Steven	Goldfish	goldfish@fishhere.net	fishhere.net	1974-04-04	323 455-4545	net
+3	Paula	Brown	pb@herowndomain.org	herowndomain.org	1978-05-24	416 323-3232	org
+4	James	Smith	jim@supergig.co.uk	supergig.co.uk	1980-10-20	416 323-8888