changeset 1:e83e0ce8fb68

Add option to reverse the search: find the reference field within the input field
author Jim Johnson <jj@umn.edu>
date Wed, 13 Aug 2014 15:01:33 -0500
parents e7e56b51d156
children c4fd2ea4f988
files find_in_reference.py find_in_reference.xml test-data/found_proteins.tabular
diffstat 3 files changed, 19 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- a/find_in_reference.py	Wed Feb 05 08:12:47 2014 -0500
+++ b/find_in_reference.py	Wed Aug 13 15:01:33 2014 -0500
@@ -41,6 +41,7 @@
   parser.add_option('-c','--input_column', dest='input_column', default=None, help='The column for the value in the input file. (first column = 1, default to last column)')
   parser.add_option('-C','--reference_column', dest='reference_column', default=None, help='The column for the value in the reference file. (first column = 1, default to last column)')
   parser.add_option( '-I', '--case_insensitive', dest='ignore_case', action="store_true", default=False, help='case insensitive' )
+  parser.add_option( '-R', '--reverse_find', dest='reverse_find', action="store_true", default=False, help='find the reference string in the input string' )
   parser.add_option( '-k', '--keep', dest='keep', action="store_true", default=False, help='' )
   parser.add_option( '-a', '--annotation_columns', dest='annotation_columns', default=None, help='If string is found, add these columns from reference' )
   parser.add_option( '-s', '--annotation_separator', dest='annotation_separator', default=';', help='separator character between annotations from different lines' )
@@ -108,12 +109,14 @@
       refFile = open(options.reference,'r')
       for tn,fline in enumerate(refFile):
         fields = fline.split('\t')
-        target_string =fields[refcol]
+        target_string = fields[refcol].rstrip('\r\n')
         if options.ignore_case:
           target_string = target_string.upper()
+        search = search_string if not options.reverse_find else target_string
+        target = target_string if not options.reverse_find else search_string
         if options.debug: 
-          print >> sys.stderr, "in: %s %s %s" % (search_string,search_string in target_string,target_string)
-        if search_string in target_string:
+          print >> sys.stderr, "in: %s %s %s" % (search,search in target,target)
+        if search in target:
           found = True
           if annotate:
             annotation = options.annotation_col_sep.join([fields[i] for i in annotation_columns])
--- a/find_in_reference.xml	Wed Feb 05 08:12:47 2014 -0500
+++ b/find_in_reference.xml	Wed Aug 13 15:01:33 2014 -0500
@@ -1,5 +1,5 @@
 <?xml version="1.0"?>
-<tool id="find_in_reference" name="find in reference" version="0.0.1">
+<tool id="find_in_reference" name="find in reference" version="0.0.2">
   <description>filter peptides that are present in proteins</description>
   <command interpreter="python">find_in_reference.py  --input "$input" 
   --reference "$reference" 
@@ -7,7 +7,7 @@
     --input_column $column.input_column
     --reference_column $column.reference_column
   #end if
-  $case_insensitive
+  $case_insensitive $reverse_find
   #if 'novel' in $outputs.__str__ or not 'found' in $outputs.__str__:
     --output "$novel"
   #end if
@@ -43,6 +43,7 @@
       </when>
     </conditional>
     <param name="case_insensitive" type="boolean" truevalue="--case_insensitive" falsevalue="" checked="false" label="Ignore case when comparing"/>
+    <param name="reverse_find" type="boolean" truevalue="--reverse_find" falsevalue="" checked="false" label="reverse search: find the reference in the input" />
     <param name="outputs" type="select" multiple="true" display="checkboxes" label="Choose outputs">
       <option value="novel" selected="true">lines with no match in reference</option>
       <option value="found">lines with match in reference</option>
@@ -100,6 +101,13 @@
       <param name="reference" value="human_proteins.tabular" ftype="tabular" dbkey="hg19"/>
       <output name="novel" file="novel_peptides.tabular"/>
     </test>
+    <test>
+      <param name="input" value="human_proteins.tabular" ftype="tabular" dbkey="hg19"/>
+      <param name="reference" value="human_peptides.tabular" ftype="tabular" dbkey="hg19"/>
+      <param name="reverse_find" value="True"/>
+      <param name="outputs" value="found"/>
+      <output name="found" file="found_proteins.tabular"/>
+    </test>
   </tests>
   <help>
 **Find in Reference**
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/found_proteins.tabular	Wed Aug 13 15:01:33 2014 -0500
@@ -0,0 +1,3 @@
+sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1	MHPAVFLSLPDLRCSLLLLVTWVFTPVTTEITSLDTENIDEILNNADVALVNFYADWCRFSQMLHPIFEEASDVIKEEFPNENQVVFARVDCDQHSDIAQRYRISKYPTLKLFRNGMMMKREYRGQRSVKALADYIRQQKSDPIQEIRDLAEITTLDRSKRNIIGYFEQKDSDNYRVFERVANILHDDCAFLSAFGDVSKPERYSGDNIIYKPPGHSAPDMVYLGAMTNFDVTYNWIQDKCVPLVREITFENGEELTEEGLPFLILFHMKEDTESLEIFQNEVARQLISEKGTINFLHADCDKFRHPLLHIQKTPADCPVIAIDSFRHMYVFGDFKDVLIPGKLKQFVFDLHSGKLHREFHHGPDPTDTAPGEQAQDVASSPPESSFQKLAPSEYRYTLLRDRDEL
+sp|P06213|INSR_HUMAN Insulin receptor OS=Homo sapiens GN=INSR PE=1 SV=4	MATGGRRGAAAAPLLVAVAALLLGAAGHLYPGEVCPGMDIRNNLTRLHELENCSVIEGHLQILLMFKTRPEDFRDLSFPKLIMITDYLLLFRVYGLESLKDLFPNLTVIRGSRLFFNYALVIFEMVHLKELGLYNLMNITRGSVRIEKNNELCYLATIDWSRILDSVEDNYIVLNKDDNEECGDICPGTAKGKTNCPATVINGQFVERCWTHSHCQKVCPTICKSHGCTAEGLCCHSECLGNCSQPDDPTKCVACRNFYLDGRCVETCPPPYYHFQDWRCVNFSFCQDLHHKCKNSRRQGCHQYVIHNNKCIPECPSGYTMNSSNLLCTPCLGPCPKVCHLLEGEKTIDSVTSAQELRGCTVINGSLIINIRGGNNLAAELEANLGLIEEISGYLKIRRSYALVSLSFFRKLRLIRGETLEIGNYSFYALDNQNLRQLWDWSKHNLTITQGKLFFHYNPKLCLSEIHKMEEVSGTKGRQERNDIALKTNGDQASCENELLKFSYIRTSFDKILLRWEPYWPPDFRDLLGFMLFYKEAPYQNVTEFDGQDACGSNSWTVVDIDPPLRSNDPKSQNHPGWLMRGLKPWTQYAIFVKTLVTFSDERRTYGAKSDIIYVQTDATNPSVPLDPISVSNSSSQIILKWKPPSDPNGNITHYLVFWERQAEDSELFELDYCLKGLKLPSRTWSPPFESEDSQKHNQSEYEDSAGECCSCPKTDSQILKELEESSFRKTFEDYLHNVVFVPRKTSSGTGAEDPRPSRKRRSLGDVGNVTVAVPTVAAFPNTSSTSVPTSPEEHRPFEKVVNKESLVISGLRHFTGYRIELQACNQDTPEERCSVAAYVSARTMPEAKADDIVGPVTHEIFENNVVHLMWQEPKEPNGLIVLYEVSYRRYGDEELHLCVSRKHFALERGCRLRGLSPGNYSVRIRATSLAGNGSWTEPTYFYVTDYLDVPSNIAKIIIGPLIFVFLFSVVIGSIYLFLRKRQPDGPLGPLYASSNPEYLSASDVFPCSVYVPDEWEVSREKITLLRELGQGSFGMVYEGNARDIIKGEAETRVAVKTVNESASLRERIEFLNEASVMKGFTCHHVVRLLGVVSKGQPTLVVMELMAHGDLKSYLRSLRPEAENNPGRPPPTLQEMIQMAAEIADGMAYLNAKKFVHRDLAARNCMVAHDFTVKIGDFGMTRDIYETDYYRKGGKGLLPVRWMAPESLKDGVFTTSSDMWSFGVVLWEITSLAEQPYQGLSNEQVLKFVMDGGYLDQPDNCPERVTDLMRMCWQFNPKMRPTFLEIVNLLKDDLHPSFPEVSFFHSEENKAPESEELEMEFEDMENVPLDRSSHCQREEAGGRDGGSSLGFKRSYEEHIPYTHMNGGKKNGRILTLPRSNPS
+sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1	MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA