changeset 0:64d46676a13e draft

Uploaded convert_characters tarball.
author devteam
date Tue, 04 Dec 2012 10:55:58 -0500
parents
children b1cac85bdd7a
files convert_characters.py convert_characters.xml test-data/1.bed test-data/a.tab test-data/a.txt test-data/eq-convert.dat
diffstat 6 files changed, 260 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/convert_characters.py	Tue Dec 04 10:55:58 2012 -0500
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+#By, Guruprasad Ananda.
+
+from galaxy import eggs
+import sys, re
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+    
+def main():
+    if len(sys.argv) != 4:
+        stop_err("usage: convert_characters infile from_char outfile")
+
+    try:
+        fin = open(sys.argv[1],'r')
+    except:
+        stop_err("Input file cannot be opened for reading.")
+    
+    from_char = sys.argv[2]
+    
+    try:
+        fout = open(sys.argv[3],'w')
+    except:
+        stop_err("Output file cannot be opened for writing.")
+    
+    char_dict = {'T':'\t','s':'\s','Dt':'\.','C':',','D':'-','U':'_','P':'\|','Co':':'}
+    from_ch = char_dict[from_char] + '+'    #making an RE to match 1 or more occurences.
+    skipped = 0
+    
+    for line in fin:
+        line = line.strip()
+        try:
+            fout.write("%s\n" %(re.sub(from_ch,'\t',line)))     
+        except:
+            skipped += 1
+            
+    if skipped:
+        print "Skipped %d lines as invalid." %skipped
+    
+if __name__ == "__main__": 
+    main()
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/convert_characters.xml	Tue Dec 04 10:55:58 2012 -0500
@@ -0,0 +1,58 @@
+<tool id="Convert characters1" name="Convert">
+  <description>delimiters to TAB</description>
+  <command interpreter="python">convert_characters.py $input $convert_from $out_file1</command>
+  <inputs>
+    <param name="convert_from" type="select" label="Convert all">
+      <option value="s">Whitespaces</option>
+      <option value="T">Tabs</option>
+      <!--<option value="Sp">Spaces</option>-->
+      <option value="Dt">Dots</option>
+      <option value="C">Commas</option>
+      <option value="D">Dashes</option>
+      <option value="U">Underscores</option>
+      <option value="P">Pipes</option>
+      <option value="Co">Colons</option>
+    </param>
+    <param format="txt" name="input" type="data" label="in Query"/>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="out_file1" />
+  </outputs>
+  <tests>
+    <test>
+      <param name="convert_from" value="s"/>
+      <param name="input" value="1.bed"/>
+      <output name="out_file1" file="eq-convert.dat"/>
+    </test>
+    <test>
+      <param name="convert_from" value="s"/>
+      <param name="input" value="a.txt"/>
+      <output name="out_file1" file="a.tab"/>
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+Converts all delimiters of a specified type into TABs.  Consecutive characters are condensed. For example, if columns are separated by 5 spaces they will converted into 1 tab.
+
+-----
+
+**Example**
+
+- Input file::
+
+    chrX||151283558|151283724|NM_000808_exon_8_0_chrX_151283559_r|0|-
+    chrX|151370273|151370486|NM_000808_exon_9_0_chrX_151370274_r|0|-
+    chrX|151559494|151559583|NM_018558_exon_1_0_chrX_151559495_f|0|+
+    chrX|151564643|151564711|NM_018558_exon_2_0_chrX_151564644_f||||0|+
+
+- Converting all pipe delimiters of the above file to TABs will get::
+
+    chrX  151283558  151283724  NM_000808_exon_8_0_chrX_151283559_r  0  -
+    chrX  151370273  151370486  NM_000808_exon_9_0_chrX_151370274_r  0  -
+    chrX  151559494  151559583  NM_018558_exon_1_0_chrX_151559495_f  0  +
+    chrX  151564643  151564711  NM_018558_exon_2_0_chrX_151564644_f  0  +
+
+</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/1.bed	Tue Dec 04 10:55:58 2012 -0500
@@ -0,0 +1,65 @@
+chr1	147962192	147962580	CCDS989.1_cds_0_0_chr1_147962193_r	0	-
+chr1	147984545	147984630	CCDS990.1_cds_0_0_chr1_147984546_f	0	+
+chr1	148078400	148078582	CCDS993.1_cds_0_0_chr1_148078401_r	0	-
+chr1	148185136	148185276	CCDS996.1_cds_0_0_chr1_148185137_f	0	+
+chr10	55251623	55253124	CCDS7248.1_cds_0_0_chr10_55251624_r	0	-
+chr11	116124407	116124501	CCDS8374.1_cds_0_0_chr11_116124408_r	0	-
+chr11	116206508	116206563	CCDS8377.1_cds_0_0_chr11_116206509_f	0	+
+chr11	116211733	116212337	CCDS8378.1_cds_0_0_chr11_116211734_r	0	-
+chr11	1812377	1812407	CCDS7726.1_cds_0_0_chr11_1812378_f	0	+
+chr12	38440094	38440321	CCDS8736.1_cds_0_0_chr12_38440095_r	0	-
+chr13	112381694	112381953	CCDS9526.1_cds_0_0_chr13_112381695_f	0	+
+chr14	98710240	98712285	CCDS9949.1_cds_0_0_chr14_98710241_r	0	-
+chr15	41486872	41487060	CCDS10096.1_cds_0_0_chr15_41486873_r	0	-
+chr15	41673708	41673857	CCDS10097.1_cds_0_0_chr15_41673709_f	0	+
+chr15	41679161	41679250	CCDS10098.1_cds_0_0_chr15_41679162_r	0	-
+chr15	41826029	41826196	CCDS10101.1_cds_0_0_chr15_41826030_f	0	+
+chr16	142908	143003	CCDS10397.1_cds_0_0_chr16_142909_f	0	+
+chr16	179963	180135	CCDS10401.1_cds_0_0_chr16_179964_r	0	-
+chr16	244413	244681	CCDS10402.1_cds_0_0_chr16_244414_f	0	+
+chr16	259268	259383	CCDS10403.1_cds_0_0_chr16_259269_r	0	-
+chr18	23786114	23786321	CCDS11891.1_cds_0_0_chr18_23786115_r	0	-
+chr18	59406881	59407046	CCDS11985.1_cds_0_0_chr18_59406882_f	0	+
+chr18	59455932	59456337	CCDS11986.1_cds_0_0_chr18_59455933_r	0	-
+chr18	59600586	59600754	CCDS11988.1_cds_0_0_chr18_59600587_f	0	+
+chr19	59068595	59069564	CCDS12866.1_cds_0_0_chr19_59068596_f	0	+
+chr19	59236026	59236146	CCDS12872.1_cds_0_0_chr19_59236027_r	0	-
+chr19	59297998	59298008	CCDS12877.1_cds_0_0_chr19_59297999_f	0	+
+chr19	59302168	59302288	CCDS12878.1_cds_0_0_chr19_59302169_r	0	-
+chr2	118288583	118288668	CCDS2120.1_cds_0_0_chr2_118288584_f	0	+
+chr2	118394148	118394202	CCDS2121.1_cds_0_0_chr2_118394149_r	0	-
+chr2	220190202	220190242	CCDS2441.1_cds_0_0_chr2_220190203_f	0	+
+chr2	220229609	220230869	CCDS2443.1_cds_0_0_chr2_220229610_r	0	-
+chr20	33330413	33330423	CCDS13249.1_cds_0_0_chr20_33330414_r	0	-
+chr20	33513606	33513792	CCDS13255.1_cds_0_0_chr20_33513607_f	0	+
+chr20	33579500	33579527	CCDS13256.1_cds_0_0_chr20_33579501_r	0	-
+chr20	33593260	33593348	CCDS13257.1_cds_0_0_chr20_33593261_f	0	+
+chr21	32707032	32707192	CCDS13614.1_cds_0_0_chr21_32707033_f	0	+
+chr21	32869641	32870022	CCDS13615.1_cds_0_0_chr21_32869642_r	0	-
+chr21	33321040	33322012	CCDS13620.1_cds_0_0_chr21_33321041_f	0	+
+chr21	33744994	33745040	CCDS13625.1_cds_0_0_chr21_33744995_r	0	-
+chr22	30120223	30120265	CCDS13897.1_cds_0_0_chr22_30120224_f	0	+
+chr22	30160419	30160661	CCDS13898.1_cds_0_0_chr22_30160420_r	0	-
+chr22	30665273	30665360	CCDS13901.1_cds_0_0_chr22_30665274_f	0	+
+chr22	30939054	30939266	CCDS13903.1_cds_0_0_chr22_30939055_r	0	-
+chr5	131424298	131424460	CCDS4149.1_cds_0_0_chr5_131424299_f	0	+
+chr5	131556601	131556672	CCDS4151.1_cds_0_0_chr5_131556602_r	0	-
+chr5	131621326	131621419	CCDS4152.1_cds_0_0_chr5_131621327_f	0	+
+chr5	131847541	131847666	CCDS4155.1_cds_0_0_chr5_131847542_r	0	-
+chr6	108299600	108299744	CCDS5061.1_cds_0_0_chr6_108299601_r	0	-
+chr6	108594662	108594687	CCDS5063.1_cds_0_0_chr6_108594663_f	0	+
+chr6	108640045	108640151	CCDS5064.1_cds_0_0_chr6_108640046_r	0	-
+chr6	108722976	108723115	CCDS5067.1_cds_0_0_chr6_108722977_f	0	+
+chr7	113660517	113660685	CCDS5760.1_cds_0_0_chr7_113660518_f	0	+
+chr7	116512159	116512389	CCDS5771.1_cds_0_0_chr7_116512160_r	0	-
+chr7	116714099	116714152	CCDS5773.1_cds_0_0_chr7_116714100_f	0	+
+chr7	116945541	116945787	CCDS5774.1_cds_0_0_chr7_116945542_r	0	-
+chr8	118881131	118881317	CCDS6324.1_cds_0_0_chr8_118881132_r	0	-
+chr9	128764156	128764189	CCDS6914.1_cds_0_0_chr9_128764157_f	0	+
+chr9	128787519	128789136	CCDS6915.1_cds_0_0_chr9_128787520_r	0	-
+chr9	128882427	128882523	CCDS6917.1_cds_0_0_chr9_128882428_f	0	+
+chr9	128937229	128937445	CCDS6919.1_cds_0_0_chr9_128937230_r	0	-
+chrX	122745047	122745924	CCDS14606.1_cds_0_0_chrX_122745048_f	0	+
+chrX	152648964	152649196	CCDS14733.1_cds_0_0_chrX_152648965_r	0	-
+chrX	152691446	152691471	CCDS14735.1_cds_0_0_chrX_152691447_f	0	+
+chrX	152694029	152694263	CCDS14736.1_cds_0_0_chrX_152694030_r	0	-
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/a.tab	Tue Dec 04 10:55:58 2012 -0500
@@ -0,0 +1,15 @@
+CHR	SNP	BP	A1	TEST	NMISS	BETA	STAT	P
+1	rs1181876	3671541	T	DOMDEV	958	-1.415	-3.326	0.0009161
+1	rs10492923	5092886	C	ADD	1007	5.105	4.368	1.382e-05
+1	rs10492923	5092886	C	DOMDEV	1007	-5.612	-4.249	2.35e-05
+1	rs10492923	5092886	C	GENO_2DF	1007	NA	19.9	4.775e-05
+1	rs1801133	11778965	T	ADD	1022	1.23	3.97	7.682e-05
+1	rs1801133	11778965	T	GENO_2DF	1022	NA	16.07	0.0003233
+1	rs1361912	12663121	A	ADD	1021	12.69	4.093	4.596e-05
+1	rs1361912	12663121	A	DOMDEV	1021	-12.37	-3.945	8.533e-05
+1	rs1361912	12663121	A	GENO_2DF	1021	NA	17.05	0.0001982
+1	rs1009806	19373138	G	ADD	1021	-1.334	-3.756	0.0001826
+1	rs1009806	19373138	G	GENO_2DF	1021	NA	19.36	6.244e-05
+1	rs873654	29550948	A	DOMDEV	1012	1.526	3.6	0.0003339
+1	rs10489527	36800027	C	ADD	1016	12.67	4.114	4.211e-05
+1	rs10489527	36800027	C	DOMDEV	1016	-13.05	-4.02	6.249e-05
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/a.txt	Tue Dec 04 10:55:58 2012 -0500
@@ -0,0 +1,15 @@
+ CHR        SNP         BP   A1       TEST    NMISS       BETA         STAT            P 
+   1  rs1181876    3671541    T     DOMDEV      958     -1.415       -3.326    0.0009161
+   1 rs10492923    5092886    C        ADD     1007      5.105        4.368    1.382e-05
+   1 rs10492923    5092886    C     DOMDEV     1007     -5.612       -4.249     2.35e-05
+   1 rs10492923    5092886    C   GENO_2DF     1007         NA         19.9    4.775e-05
+   1  rs1801133   11778965    T        ADD     1022       1.23         3.97    7.682e-05
+   1  rs1801133   11778965    T   GENO_2DF     1022         NA        16.07    0.0003233
+   1  rs1361912   12663121    A        ADD     1021      12.69        4.093    4.596e-05
+   1  rs1361912   12663121    A     DOMDEV     1021     -12.37       -3.945    8.533e-05
+   1  rs1361912   12663121    A   GENO_2DF     1021         NA        17.05    0.0001982
+   1  rs1009806   19373138    G        ADD     1021     -1.334       -3.756    0.0001826
+   1  rs1009806   19373138    G   GENO_2DF     1021         NA        19.36    6.244e-05
+   1   rs873654   29550948    A     DOMDEV     1012      1.526          3.6    0.0003339
+   1 rs10489527   36800027    C        ADD     1016      12.67        4.114    4.211e-05
+   1 rs10489527   36800027    C     DOMDEV     1016     -13.05        -4.02    6.249e-05
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/eq-convert.dat	Tue Dec 04 10:55:58 2012 -0500
@@ -0,0 +1,65 @@
+chr1	147962192	147962580	CCDS989.1_cds_0_0_chr1_147962193_r	0	-
+chr1	147984545	147984630	CCDS990.1_cds_0_0_chr1_147984546_f	0	+
+chr1	148078400	148078582	CCDS993.1_cds_0_0_chr1_148078401_r	0	-
+chr1	148185136	148185276	CCDS996.1_cds_0_0_chr1_148185137_f	0	+
+chr10	55251623	55253124	CCDS7248.1_cds_0_0_chr10_55251624_r	0	-
+chr11	116124407	116124501	CCDS8374.1_cds_0_0_chr11_116124408_r	0	-
+chr11	116206508	116206563	CCDS8377.1_cds_0_0_chr11_116206509_f	0	+
+chr11	116211733	116212337	CCDS8378.1_cds_0_0_chr11_116211734_r	0	-
+chr11	1812377	1812407	CCDS7726.1_cds_0_0_chr11_1812378_f	0	+
+chr12	38440094	38440321	CCDS8736.1_cds_0_0_chr12_38440095_r	0	-
+chr13	112381694	112381953	CCDS9526.1_cds_0_0_chr13_112381695_f	0	+
+chr14	98710240	98712285	CCDS9949.1_cds_0_0_chr14_98710241_r	0	-
+chr15	41486872	41487060	CCDS10096.1_cds_0_0_chr15_41486873_r	0	-
+chr15	41673708	41673857	CCDS10097.1_cds_0_0_chr15_41673709_f	0	+
+chr15	41679161	41679250	CCDS10098.1_cds_0_0_chr15_41679162_r	0	-
+chr15	41826029	41826196	CCDS10101.1_cds_0_0_chr15_41826030_f	0	+
+chr16	142908	143003	CCDS10397.1_cds_0_0_chr16_142909_f	0	+
+chr16	179963	180135	CCDS10401.1_cds_0_0_chr16_179964_r	0	-
+chr16	244413	244681	CCDS10402.1_cds_0_0_chr16_244414_f	0	+
+chr16	259268	259383	CCDS10403.1_cds_0_0_chr16_259269_r	0	-
+chr18	23786114	23786321	CCDS11891.1_cds_0_0_chr18_23786115_r	0	-
+chr18	59406881	59407046	CCDS11985.1_cds_0_0_chr18_59406882_f	0	+
+chr18	59455932	59456337	CCDS11986.1_cds_0_0_chr18_59455933_r	0	-
+chr18	59600586	59600754	CCDS11988.1_cds_0_0_chr18_59600587_f	0	+
+chr19	59068595	59069564	CCDS12866.1_cds_0_0_chr19_59068596_f	0	+
+chr19	59236026	59236146	CCDS12872.1_cds_0_0_chr19_59236027_r	0	-
+chr19	59297998	59298008	CCDS12877.1_cds_0_0_chr19_59297999_f	0	+
+chr19	59302168	59302288	CCDS12878.1_cds_0_0_chr19_59302169_r	0	-
+chr2	118288583	118288668	CCDS2120.1_cds_0_0_chr2_118288584_f	0	+
+chr2	118394148	118394202	CCDS2121.1_cds_0_0_chr2_118394149_r	0	-
+chr2	220190202	220190242	CCDS2441.1_cds_0_0_chr2_220190203_f	0	+
+chr2	220229609	220230869	CCDS2443.1_cds_0_0_chr2_220229610_r	0	-
+chr20	33330413	33330423	CCDS13249.1_cds_0_0_chr20_33330414_r	0	-
+chr20	33513606	33513792	CCDS13255.1_cds_0_0_chr20_33513607_f	0	+
+chr20	33579500	33579527	CCDS13256.1_cds_0_0_chr20_33579501_r	0	-
+chr20	33593260	33593348	CCDS13257.1_cds_0_0_chr20_33593261_f	0	+
+chr21	32707032	32707192	CCDS13614.1_cds_0_0_chr21_32707033_f	0	+
+chr21	32869641	32870022	CCDS13615.1_cds_0_0_chr21_32869642_r	0	-
+chr21	33321040	33322012	CCDS13620.1_cds_0_0_chr21_33321041_f	0	+
+chr21	33744994	33745040	CCDS13625.1_cds_0_0_chr21_33744995_r	0	-
+chr22	30120223	30120265	CCDS13897.1_cds_0_0_chr22_30120224_f	0	+
+chr22	30160419	30160661	CCDS13898.1_cds_0_0_chr22_30160420_r	0	-
+chr22	30665273	30665360	CCDS13901.1_cds_0_0_chr22_30665274_f	0	+
+chr22	30939054	30939266	CCDS13903.1_cds_0_0_chr22_30939055_r	0	-
+chr5	131424298	131424460	CCDS4149.1_cds_0_0_chr5_131424299_f	0	+
+chr5	131556601	131556672	CCDS4151.1_cds_0_0_chr5_131556602_r	0	-
+chr5	131621326	131621419	CCDS4152.1_cds_0_0_chr5_131621327_f	0	+
+chr5	131847541	131847666	CCDS4155.1_cds_0_0_chr5_131847542_r	0	-
+chr6	108299600	108299744	CCDS5061.1_cds_0_0_chr6_108299601_r	0	-
+chr6	108594662	108594687	CCDS5063.1_cds_0_0_chr6_108594663_f	0	+
+chr6	108640045	108640151	CCDS5064.1_cds_0_0_chr6_108640046_r	0	-
+chr6	108722976	108723115	CCDS5067.1_cds_0_0_chr6_108722977_f	0	+
+chr7	113660517	113660685	CCDS5760.1_cds_0_0_chr7_113660518_f	0	+
+chr7	116512159	116512389	CCDS5771.1_cds_0_0_chr7_116512160_r	0	-
+chr7	116714099	116714152	CCDS5773.1_cds_0_0_chr7_116714100_f	0	+
+chr7	116945541	116945787	CCDS5774.1_cds_0_0_chr7_116945542_r	0	-
+chr8	118881131	118881317	CCDS6324.1_cds_0_0_chr8_118881132_r	0	-
+chr9	128764156	128764189	CCDS6914.1_cds_0_0_chr9_128764157_f	0	+
+chr9	128787519	128789136	CCDS6915.1_cds_0_0_chr9_128787520_r	0	-
+chr9	128882427	128882523	CCDS6917.1_cds_0_0_chr9_128882428_f	0	+
+chr9	128937229	128937445	CCDS6919.1_cds_0_0_chr9_128937230_r	0	-
+chrX	122745047	122745924	CCDS14606.1_cds_0_0_chrX_122745048_f	0	+
+chrX	152648964	152649196	CCDS14733.1_cds_0_0_chrX_152648965_r	0	-
+chrX	152691446	152691471	CCDS14735.1_cds_0_0_chrX_152691447_f	0	+
+chrX	152694029	152694263	CCDS14736.1_cds_0_0_chrX_152694030_r	0	-