changeset 3:f4cc06e92530 draft

Uploaded
author tyty
date Mon, 15 Sep 2014 14:52:20 -0400
parents 297cdb01d656
children a292aaf51735
files Iterative_mapping/.DS_Store Iterative_mapping/iterative_map.py Iterative_mapping/iterative_map.xml Iterative_mapping/map_ex.py Iterative_mapping/read_file.py Iterative_mapping/read_s_file.py Iterative_mapping/remove_map.py Iterative_mapping/seq_track.py Iterative_mapping/tool_dependencies.xml Iterative_mapping/truncate.py Iterative_mapping/unmap.py get_reads/get_read.py get_reads/get_read.xml get_reads/read_file.py get_reads/test.bam get_reads/tool_dependencies.xml
diffstat 16 files changed, 402 insertions(+), 153 deletions(-) [+]
line wrap: on
line diff
Binary file Iterative_mapping/.DS_Store has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Iterative_mapping/iterative_map.py	Mon Sep 15 14:52:20 2014 -0400
@@ -0,0 +1,103 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import os
+from read_file import *
+from read_s_file import *
+
+type_input = sys.argv[1]
+seq_file = sys.argv[2]
+ref_file = sys.argv[3]
+shift = sys.argv[4]
+length = sys.argv[5]
+map_type = sys.argv[6]
+output_file = sys.argv[7]
+
+
+if map_type!="default":
+    s = ""
+    s = s+"-v "+sys.argv[8]
+    s = s+" -5 "+sys.argv[9]
+    s = s+" -3 "+sys.argv[10]
+    s = s+" -k "+sys.argv[11]
+    if sys.argv[12]:
+        s = s+" -a"
+    if int(sys.argv[13])>=1:
+        s = s+" -m "+sys.argv[13]
+    if sys.argv[14]:
+        s = s+" --best --strata "
+    
+else:
+    s = "-v 3 -a --best --strata "
+
+ospath = os.path.realpath(sys.argv[0])
+ost = ospath.split('/')
+syspath = ""
+for i in range(len(ost)-1):
+    syspath = syspath+ost[i].strip()
+    syspath = syspath+'/'
+
+os.system("bowtie-build -f "+ref_file+" "+syspath+"ref > "+syspath+"log.txt")
+
+os.system("cp "+seq_file+" "+syspath+"seq0.fa")
+
+if type_input == "fasta":
+    tp = 'fasta'
+if type_input == "fastq":
+    tp = 'fastq'
+
+k = 0
+print(type_input)
+while(True):
+    if type_input == "fasta":
+        os.system("bowtie "+s+"-f "+syspath+"ref"+" "+syspath+"seq"+str(k)+".fa --quiet -S > "+syspath+"map"+str(k)+".sam")
+    if type_input == "fastq":
+        os.system("bowtie "+s+"-q "+syspath+"ref"+" "+syspath+"seq"+str(k)+".fa --quiet -S > "+syspath+"map"+str(k)+".sam")
+    os.system("samtools view -Sb -F 0xfff "+syspath+"map"+str(k)+".sam > "+syspath+"mapped"+str(k)+".bam 2>"+syspath+"log.txt") #get mapped reads
+    os.system("samtools view -Sb -f 0x4 "+syspath+"map"+str(k)+".sam > "+syspath+"umapped"+str(k)+".bam 2>"+syspath+"log.txt") #get unmapped reads
+    os.system("samtools view -Sb -f 0x10 "+syspath+"map"+str(k)+".sam > "+syspath+"rmapped"+str(k)+".bam 2>"+syspath+"log.txt") #get reversed mapped reads
+    os.system("samtools merge -f "+syspath+"unmapped"+str(k)+".bam "+syspath+"umapped"+str(k)+".bam "+syspath+"rmapped"+str(k)+".bam") #get reversed mapped reads
+    os.system("samtools view -h -o "+syspath+"unmapped"+str(k)+".sam "+syspath+"unmapped"+str(k)+".bam") #get reversed mapped reads
+    if k>0:
+        os.system("samtools view -h -o "+syspath+"mapped"+str(k)+".sam "+syspath+"mapped"+str(k)+".bam") #get reversed mapped reads
+        os.system("cut -f 1 "+syspath+"unmapped"+str(k)+".sam > "+syspath+"unmapped"+str(k)+".txt")
+        os.system("cut -f 1 "+syspath+"mapped"+str(k)+".sam > "+syspath+"mapped"+str(k)+".txt")
+        os.system("python "+syspath+"remove_map.py "+syspath+"unmapped"+str(k)+".txt "+syspath+"mapped"+str(k)+".txt "+syspath+"runmapped"+str(k)+".txt")
+        os.system("rm "+syspath+"mapped"+str(k)+".sam")
+        os.system("rm "+syspath+"mapped"+str(k)+".txt")
+        os.system("rm "+syspath+"unmapped"+str(k)+".txt")
+    else:
+        os.system("cut -f 1 "+syspath+"unmapped"+str(k)+".sam > "+syspath+"runmapped"+str(k)+".txt")
+    
+    os.system("rm "+syspath+"unmapped"+str(k)+".bam")
+    os.system("rm "+syspath+"umapped"+str(k)+".bam")
+    os.system("rm "+syspath+"rmapped"+str(k)+".bam")
+    os.system("python "+syspath+"seq_track.py "+syspath+"runmapped"+str(k)+".txt "+syspath+"seq"+str(k)+".fa "+syspath+"unmap_seq"+str(k)+".fa "+tp) #get unmapped sequence
+    os.system("python "+syspath+"truncate.py "+syspath+"unmap_seq"+str(k)+".fa "+shift+" "+syspath+"seq"+str(k+1)+".fa "+length) #truncate unmapped sequence
+    os.system("rm "+syspath+"seq"+str(k)+".fa") #Remove sequences being mapped
+    os.system("rm "+syspath+"map"+str(k)+".sam") #Remove mapping file
+    os.system("rm "+syspath+"unmap_seq"+str(k)+".fa") #Remove unmapped sequnce
+    os.system("rm "+syspath+"runmapped"+str(k)+".txt")
+    os.system("rm "+syspath+"unmapped"+str(k)+".sam")
+    
+    os.system("wc -l "+syspath+"seq"+str(k+1)+".fa > "+syspath+"count"+str(k+1)+".txt")
+    c = read_sp_file(syspath+"count"+str(k+1)+".txt")
+    if c[0][0] == '0': #If no reads is in the sequence file, stop
+        os.system("rm "+syspath+"count"+str(k+1)+".txt")
+        os.system("rm "+syspath+"seq"+str(k+1)+".fa")
+        break
+    os.system("rm "+syspath+"count"+str(k+1)+".txt")
+    k = k+1
+
+ss = ""
+for i in range(0,k+1):
+    ss = ss+" "+syspath+"mapped"+str(i)+".bam"
+
+
+os.system("samtools merge -f "+output_file+" "+ss)
+#print("samtools merge mapped_all.bam"+ss)
+os.system("rm "+syspath+"mapped*.bam")
+os.system("rm "+syspath+"ref*")
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Iterative_mapping/iterative_map.xml	Mon Sep 15 14:52:20 2014 -0400
@@ -0,0 +1,80 @@
+<tool id="iterative_map_pipeline" name="Iterative mapping" version="1.0">
+	<description></description>
+	<command interpreter="python">
+        #if $mapping_file.type == "user"
+            iterative_map.py $file_format.type $file_format.seq_file $reference_file $shift $length $mapping_file.type $output $mapping_file.param_v $mapping_file.param_five $mapping_file.param_three $mapping_file.param_k $mapping_file.param_a $mapping_file.param_m $mapping_file.param_best  
+        #else
+            iterative_map.py $file_format.type $file_format.seq_file $reference_file $shift $length $mapping_file.type $output
+        #end if
+    </command>
+        <requirements>
+                <requirement type="package" version="1.61">biopython</requirement>
+                <requirement type="package" version="1.7">numpy</requirement>
+                <requirement type="package" version="0.1.18">samtools</requirement>
+                <requirement type="package" version="0.12.7">bowtie</requirement>
+        </requirements>
+	<inputs>
+                <conditional name="file_format">
+                  <param name="type" type="select" label="Format of the file of the reads (Default FASTQ)">
+                    <option value="fastq">FASTQ</option>
+                    <option value="fasta">FASTA</option>
+                  </param>
+                  <when value="fastq">
+                    <param name="seq_file" type="data" format="fastq" label="Fastq file"/>
+                  </when>
+                  <when value="fasta">
+                    <param name="seq_file" type="data" format="fasta" label="Fasta file"/>
+                  </when>
+                </conditional>
+		<param name="reference_file" type="data" format="fasta" label="Reference genome/transcriptome"/>
+                <param name="shift" type="integer" value="1" label="Number of nucleotide trimmed each round"/>
+                <param name="length" type="integer" value="21" label="Minimum requirement of read length for mapping"/>
+                <conditional name="mapping_file">
+                  <param name="type" type="select" label="Bowtie mapping flags (Default -v 0 -a --best --strata)">
+                    <option value="default">Default</option>
+                    <option value="user">User specified</option>
+                  </param>
+                  <when value="default"/>
+                  <when value="user"> 
+                    <param name="param_v" type="integer" value="0" label="Number of mismatches for SOAP-like alignment policy (-v)"/>
+                    <param name="param_five" type="integer" value="0" label="Trim n bases from high-quality (left) end of each read before alignment (-5)"/>
+                    <param name="param_three" type="integer" value="0" label="Trim n bases from high-quality (right) end of each read before alignment (-3)"/>
+                    <param name="param_k" type="integer" value="1" label="Report up to n valid alignments per read (-k)"/>
+                    <param name="param_a" type="boolean" checked="False" truevalue = "1" falsevalue = "0" label="Whether or not to report all valid alignments per read (-a)"/>
+                    <param name="param_m" type="integer" value="-1" label="Suppress all alignments for a read if more than n reportable alignments exist (-m), -1 for unlimited"/>
+                    <param name="param_best" type="boolean" checked="False" truevalue = "1" falsevalue = "0" label="Whether or not to make Bowtie guarantee that reported singleton alignments are 'best' in terms of stratum and in terms of the quality values at the mismatched positions (--best --strata)"/>
+                  </when>
+                </conditional>
+
+	</inputs>
+	<outputs>
+		<data name="output" type="data" format="bam"/>
+	</outputs>
+
+	<help>
+
+
+**TIPS**:
+
+-----
+
+**Input**:
+
+* 1. Sequence file type (FASTA/FASTQ)
+* 2. Sequence file (fasta/fastq format) {Default: fastq file}
+* 3. Reference file (e.g. cDNA library [fasta])
+* 4. “Shift” (The length of the sequence that will be trimmed at the 3’end of the reads before each round of mapping)
+* 5. “Length” (The minimum length of the reads for mapping after trimming)
+* [Optional]
+* 1. Bowtie mapping flags (options) [Default: -v 0 -a --best --strata] (-v flag indicates the number of allowed mismatches. use -5/-3 flag to trim nucleotides from 5'/3' end of the reads)
+
+-----
+
+**Output**:
+
+A bam file with all of the reads that are mapped	
+
+
+
+	</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Iterative_mapping/map_ex.py	Mon Sep 15 14:52:20 2014 -0400
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+from read_file import *
+from Bio import SeqIO
+
+map_file = sys.argv[1]
+result_file = sys.argv[2]
+
+
+#reads = read_t_file(read_file);
+
+f = open(map_file);
+h = file(result_file, 'w')
+
+for aline in f.readlines():
+    tline = aline.strip();
+    tl = tline.split('\t');
+    if len(tl)>4:
+        if int(tl[1].strip())== 0:
+            h.write(tline)
+            h.write('\n')
+
+
+f.close();
+h.close()
+
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Iterative_mapping/read_file.py	Mon Sep 15 14:52:20 2014 -0400
@@ -0,0 +1,21 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+
+
+
+def read_t_file(in_file):
+    f = open(in_file);
+    result = [];
+    for aline in f.readlines():
+        temp = [];
+        tline = aline.strip();
+        tl = tline.split('\t');
+        for i in range(0, len(tl)):
+            temp.append(tl[i].strip());
+        result.append(temp);
+    f.close();
+    return result;
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Iterative_mapping/read_s_file.py	Mon Sep 15 14:52:20 2014 -0400
@@ -0,0 +1,22 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+
+
+
+def read_sp_file(in_file):
+    f = open(in_file);
+    result = [];
+    for aline in f.readlines():
+        temp = [];
+        tline = aline.strip();
+        tl = tline.split(' ');
+        for i in range(0, len(tl)):
+            if len(tl[i].strip())>0:
+                temp.append(tl[i].strip());
+        result.append(temp);
+    f.close();
+    return result;
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Iterative_mapping/remove_map.py	Mon Sep 15 14:52:20 2014 -0400
@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+from read_file import *
+
+
+unmap_file = sys.argv[1]
+map_file = sys.argv[2]
+result_file = sys.argv[3]
+
+
+unmap = read_t_file(unmap_file)
+mapped = read_t_file(map_file)
+h = file(result_file, 'w')
+
+maps = set()
+for i in range(len(mapped)):
+    maps.add(mapped[i][0])
+
+
+for i in range(len(unmap)):
+    name = unmap[i][0]
+    if name not in maps:
+        h.write(name)
+        h.write('\n')
+
+
+h.close()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Iterative_mapping/seq_track.py	Mon Sep 15 14:52:20 2014 -0400
@@ -0,0 +1,38 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+from read_file import *
+from Bio import SeqIO
+
+unmap_file = sys.argv[1]
+reads_file = sys.argv[2]
+result_file = sys.argv[3]
+tp = sys.argv[4]
+
+
+unmap = read_t_file(unmap_file);
+
+h = file(result_file, 'w')
+
+reads = SeqIO.parse(reads_file,tp)
+um = set()
+for i in range(0, len(unmap)):
+    id_r = unmap[i][0]
+    um.add(id_r)
+
+for read in reads:
+    if read.id in um:
+        h.write('>')
+        h.write(read.id)
+        h.write('\n')
+        h.write(read.seq.tostring())
+        h.write('\n')
+    
+
+
+h.close()
+
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Iterative_mapping/tool_dependencies.xml	Mon Sep 15 14:52:20 2014 -0400
@@ -0,0 +1,15 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="biopython" version="1.61">
+        <repository changeset_revision="ae9dda584395" name="package_biopython_1_61" owner="biopython" prior_installation_required="True" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+    <package name="numpy" version="1.7">
+        <repository changeset_revision="ef12a3a11d5b" name="package_numpy_1_7" owner="iuc" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+    <package name="samtools" version="0.1.18">
+        <repository changeset_revision="171cd8bc208d" name="package_samtools_0_1_18" owner="devteam" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+    <package name="bowtie" version="0.12.7">
+        <repository changeset_revision="9f9f38617a98" name="package_bowtie_0_12_7" owner="devteam" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Iterative_mapping/truncate.py	Mon Sep 15 14:52:20 2014 -0400
@@ -0,0 +1,32 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+from Bio import SeqIO
+
+fasta_file = sys.argv[1]
+shift_in = sys.argv[2]
+result_file = sys.argv[3]
+length = sys.argv[4]
+
+shift = int(shift_in)
+    
+fasta_sequences = SeqIO.parse(open(fasta_file),'fasta');
+h = file(result_file,'w')
+for seq in fasta_sequences:
+        nuc = seq.id;
+        sequence = seq.seq.tostring();
+        if (len(sequence)-shift)>=int(length):
+                h.write('>'+nuc)
+                h.write('\n')
+                h.write(sequence[0:(len(sequence)-shift)])
+                h.write('\n')
+
+
+
+
+h.close()
+
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Iterative_mapping/unmap.py	Mon Sep 15 14:52:20 2014 -0400
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+from read_file import *
+from Bio import SeqIO
+
+map_file = sys.argv[1]
+result_file = sys.argv[2]
+
+
+#reads = read_t_file(read_file);
+
+f = open(map_file);
+h = file(result_file, 'w')
+
+for aline in f.readlines():
+    tline = aline.strip();
+    tl = tline.split('\t');
+    if len(tl)>4:
+        if int(tl[1].strip()) != 0:
+            h.write(tl[0].strip());
+            h.write('\n');
+
+
+f.close();
+h.close()
+
+
+
+
--- a/get_reads/get_read.py	Mon Sep 15 14:47:42 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,77 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import sys
-#from galaxy.tools.read_file import *
-from Bio import SeqIO
-import os
-from read_file import *
-
-fasta_file = sys.argv[1]
-map_file = sys.argv[2]
-result_file = sys.argv[3]
-
-os.system("samtools view -F 0xfff "+map_file+"|cut -f 3,4 > map_info.txt") 
-
-fasta_sequences = SeqIO.parse(open(fasta_file),'fasta');
-length_seq = {};
-for seq in fasta_sequences:
-        nuc = seq.id;
-        length_seq[nuc] = len(seq.seq.tostring());
-
-
-
-mapping = {}
-transcripts = []
-
-f = open("map_info.txt");
-for aline in f.readlines():
-    tline = aline.strip();
-    tl = tline.split('\t');
-    if tl[0].strip() not in transcripts:
-        transcripts.append(tl[0].strip());
-        mapping[tl[0].strip()] = [];
-
-    mapping[tl[0].strip()].append(tl[1].strip());
-
-distribution = {};
-coverage = {};
-for transcript in length_seq:
-    distribution[transcript] = [];
-    for i in range(0, length_seq[transcript]):
-        distribution[transcript].append(0);
-    sum_count = float(0);
-    if transcript in mapping:
-        for j in range(0, len(mapping[transcript])):
-            index = mapping[transcript][j];
-            #count = reads[mapping[transcript][j][0]];
-            sum_count = sum_count + 1;
-            distribution[transcript][int(index)-1] = distribution[transcript][int(index)-1] + 1;
-            coverage[transcript] = float(sum_count)/float(length_seq[transcript]);
-    else:
-        coverage[transcript] = 0
-
-        
-        
-    
-
-h = file(result_file, 'w')
-for transcript in length_seq:
-    h.write(transcript);
-    h.write('\n')
-    for i in range(0, length_seq[transcript]):
-        h.write(str(distribution[transcript][i]))
-        h.write('\t')
-    h.write('\n')
-    h.write('\n')
-
-
-
-    
-
-f.close();
-h.close()
-
-
-
-
--- a/get_reads/get_read.xml	Mon Sep 15 14:47:42 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,43 +0,0 @@
-<tool id="get_read_pipeline" name="Get RT stop counts" version="1.0">
-	<description></description>
-	<command interpreter="python">get_read.py $lib_file $map_file $output </command>
-        <requirements>
-                <requirement type="package" version="1.61">biopython</requirement>
-                <requirement type="package" version="0.1.18">samtools</requirement>
-        </requirements>
-	<inputs>
-                <param name="lib_file" type="data" format="fasta" label="Library file (fasta)"/>
-		<param name="map_file" type="data" format="bam" label="Mapped file"/>
-	</inputs>
-	<outputs>
-		<data name="output" format="txt"/>
-	</outputs>
-        <tests>
-          <test>
-            <param name="lib_file" value="test.bam" />
-	    <param name="map_file" value="com_rna.txt" />
-	    <output name="output" file="get_RT_stop_test.out" />
- 
-          </test>
-        </tests>
-
-	<help>
-
-
-**TIPS**:
-
------
-
-**Input**
-1. A mapped (bam) file from Bowtie (or any mapping program)
-2. Reference library sequences (fasta) used to map the reads
-
------
-
-**Output**:
-A text file with reverse transcription stop counts mapped to each nucleotide (RTSC file)	
-
-
-
-	</help>
-</tool>
--- a/get_reads/read_file.py	Mon Sep 15 14:47:42 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,21 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import sys
-
-
-
-def read_t_file(in_file):
-    f = open(in_file);
-    result = [];
-    for aline in f.readlines():
-        temp = [];
-        tline = aline.strip();
-        tl = tline.split('\t');
-        for i in range(0, len(tl)):
-            temp.append(tl[i].strip());
-        result.append(temp);
-    f.close();
-    return result;
-
-
Binary file get_reads/test.bam has changed
--- a/get_reads/tool_dependencies.xml	Mon Sep 15 14:47:42 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,12 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-    <package name="biopython" version="1.61">
-        <repository changeset_revision="ae9dda584395" name="package_biopython_1_61" owner="biopython" prior_installation_required="True" toolshed="http://toolshed.g2.bx.psu.edu" />
-    </package>
-    <package name="numpy" version="1.7">
-        <repository changeset_revision="ef12a3a11d5b" name="package_numpy_1_7" owner="iuc" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" />
-    </package>
-    <package name="samtools" version="0.1.18">
-        <repository changeset_revision="171cd8bc208d" name="package_samtools_0_1_18" owner="devteam" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" />
-    </package>
-</tool_dependency>