Mercurial > repos > tyty > structurefold
changeset 3:f4cc06e92530 draft
Uploaded
author | tyty |
---|---|
date | Mon, 15 Sep 2014 14:52:20 -0400 |
parents | 297cdb01d656 |
children | a292aaf51735 |
files | Iterative_mapping/.DS_Store Iterative_mapping/iterative_map.py Iterative_mapping/iterative_map.xml Iterative_mapping/map_ex.py Iterative_mapping/read_file.py Iterative_mapping/read_s_file.py Iterative_mapping/remove_map.py Iterative_mapping/seq_track.py Iterative_mapping/tool_dependencies.xml Iterative_mapping/truncate.py Iterative_mapping/unmap.py get_reads/get_read.py get_reads/get_read.xml get_reads/read_file.py get_reads/test.bam get_reads/tool_dependencies.xml |
diffstat | 16 files changed, 402 insertions(+), 153 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Iterative_mapping/iterative_map.py Mon Sep 15 14:52:20 2014 -0400 @@ -0,0 +1,103 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys +import os +from read_file import * +from read_s_file import * + +type_input = sys.argv[1] +seq_file = sys.argv[2] +ref_file = sys.argv[3] +shift = sys.argv[4] +length = sys.argv[5] +map_type = sys.argv[6] +output_file = sys.argv[7] + + +if map_type!="default": + s = "" + s = s+"-v "+sys.argv[8] + s = s+" -5 "+sys.argv[9] + s = s+" -3 "+sys.argv[10] + s = s+" -k "+sys.argv[11] + if sys.argv[12]: + s = s+" -a" + if int(sys.argv[13])>=1: + s = s+" -m "+sys.argv[13] + if sys.argv[14]: + s = s+" --best --strata " + +else: + s = "-v 3 -a --best --strata " + +ospath = os.path.realpath(sys.argv[0]) +ost = ospath.split('/') +syspath = "" +for i in range(len(ost)-1): + syspath = syspath+ost[i].strip() + syspath = syspath+'/' + +os.system("bowtie-build -f "+ref_file+" "+syspath+"ref > "+syspath+"log.txt") + +os.system("cp "+seq_file+" "+syspath+"seq0.fa") + +if type_input == "fasta": + tp = 'fasta' +if type_input == "fastq": + tp = 'fastq' + +k = 0 +print(type_input) +while(True): + if type_input == "fasta": + os.system("bowtie "+s+"-f "+syspath+"ref"+" "+syspath+"seq"+str(k)+".fa --quiet -S > "+syspath+"map"+str(k)+".sam") + if type_input == "fastq": + os.system("bowtie "+s+"-q "+syspath+"ref"+" "+syspath+"seq"+str(k)+".fa --quiet -S > "+syspath+"map"+str(k)+".sam") + os.system("samtools view -Sb -F 0xfff "+syspath+"map"+str(k)+".sam > "+syspath+"mapped"+str(k)+".bam 2>"+syspath+"log.txt") #get mapped reads + os.system("samtools view -Sb -f 0x4 "+syspath+"map"+str(k)+".sam > "+syspath+"umapped"+str(k)+".bam 2>"+syspath+"log.txt") #get unmapped reads + os.system("samtools view -Sb -f 0x10 "+syspath+"map"+str(k)+".sam > "+syspath+"rmapped"+str(k)+".bam 2>"+syspath+"log.txt") #get reversed mapped reads + os.system("samtools merge -f "+syspath+"unmapped"+str(k)+".bam "+syspath+"umapped"+str(k)+".bam "+syspath+"rmapped"+str(k)+".bam") #get reversed mapped reads + os.system("samtools view -h -o "+syspath+"unmapped"+str(k)+".sam "+syspath+"unmapped"+str(k)+".bam") #get reversed mapped reads + if k>0: + os.system("samtools view -h -o "+syspath+"mapped"+str(k)+".sam "+syspath+"mapped"+str(k)+".bam") #get reversed mapped reads + os.system("cut -f 1 "+syspath+"unmapped"+str(k)+".sam > "+syspath+"unmapped"+str(k)+".txt") + os.system("cut -f 1 "+syspath+"mapped"+str(k)+".sam > "+syspath+"mapped"+str(k)+".txt") + os.system("python "+syspath+"remove_map.py "+syspath+"unmapped"+str(k)+".txt "+syspath+"mapped"+str(k)+".txt "+syspath+"runmapped"+str(k)+".txt") + os.system("rm "+syspath+"mapped"+str(k)+".sam") + os.system("rm "+syspath+"mapped"+str(k)+".txt") + os.system("rm "+syspath+"unmapped"+str(k)+".txt") + else: + os.system("cut -f 1 "+syspath+"unmapped"+str(k)+".sam > "+syspath+"runmapped"+str(k)+".txt") + + os.system("rm "+syspath+"unmapped"+str(k)+".bam") + os.system("rm "+syspath+"umapped"+str(k)+".bam") + os.system("rm "+syspath+"rmapped"+str(k)+".bam") + os.system("python "+syspath+"seq_track.py "+syspath+"runmapped"+str(k)+".txt "+syspath+"seq"+str(k)+".fa "+syspath+"unmap_seq"+str(k)+".fa "+tp) #get unmapped sequence + os.system("python "+syspath+"truncate.py "+syspath+"unmap_seq"+str(k)+".fa "+shift+" "+syspath+"seq"+str(k+1)+".fa "+length) #truncate unmapped sequence + os.system("rm "+syspath+"seq"+str(k)+".fa") #Remove sequences being mapped + os.system("rm "+syspath+"map"+str(k)+".sam") #Remove mapping file + os.system("rm "+syspath+"unmap_seq"+str(k)+".fa") #Remove unmapped sequnce + os.system("rm "+syspath+"runmapped"+str(k)+".txt") + os.system("rm "+syspath+"unmapped"+str(k)+".sam") + + os.system("wc -l "+syspath+"seq"+str(k+1)+".fa > "+syspath+"count"+str(k+1)+".txt") + c = read_sp_file(syspath+"count"+str(k+1)+".txt") + if c[0][0] == '0': #If no reads is in the sequence file, stop + os.system("rm "+syspath+"count"+str(k+1)+".txt") + os.system("rm "+syspath+"seq"+str(k+1)+".fa") + break + os.system("rm "+syspath+"count"+str(k+1)+".txt") + k = k+1 + +ss = "" +for i in range(0,k+1): + ss = ss+" "+syspath+"mapped"+str(i)+".bam" + + +os.system("samtools merge -f "+output_file+" "+ss) +#print("samtools merge mapped_all.bam"+ss) +os.system("rm "+syspath+"mapped*.bam") +os.system("rm "+syspath+"ref*") + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Iterative_mapping/iterative_map.xml Mon Sep 15 14:52:20 2014 -0400 @@ -0,0 +1,80 @@ +<tool id="iterative_map_pipeline" name="Iterative mapping" version="1.0"> + <description></description> + <command interpreter="python"> + #if $mapping_file.type == "user" + iterative_map.py $file_format.type $file_format.seq_file $reference_file $shift $length $mapping_file.type $output $mapping_file.param_v $mapping_file.param_five $mapping_file.param_three $mapping_file.param_k $mapping_file.param_a $mapping_file.param_m $mapping_file.param_best + #else + iterative_map.py $file_format.type $file_format.seq_file $reference_file $shift $length $mapping_file.type $output + #end if + </command> + <requirements> + <requirement type="package" version="1.61">biopython</requirement> + <requirement type="package" version="1.7">numpy</requirement> + <requirement type="package" version="0.1.18">samtools</requirement> + <requirement type="package" version="0.12.7">bowtie</requirement> + </requirements> + <inputs> + <conditional name="file_format"> + <param name="type" type="select" label="Format of the file of the reads (Default FASTQ)"> + <option value="fastq">FASTQ</option> + <option value="fasta">FASTA</option> + </param> + <when value="fastq"> + <param name="seq_file" type="data" format="fastq" label="Fastq file"/> + </when> + <when value="fasta"> + <param name="seq_file" type="data" format="fasta" label="Fasta file"/> + </when> + </conditional> + <param name="reference_file" type="data" format="fasta" label="Reference genome/transcriptome"/> + <param name="shift" type="integer" value="1" label="Number of nucleotide trimmed each round"/> + <param name="length" type="integer" value="21" label="Minimum requirement of read length for mapping"/> + <conditional name="mapping_file"> + <param name="type" type="select" label="Bowtie mapping flags (Default -v 0 -a --best --strata)"> + <option value="default">Default</option> + <option value="user">User specified</option> + </param> + <when value="default"/> + <when value="user"> + <param name="param_v" type="integer" value="0" label="Number of mismatches for SOAP-like alignment policy (-v)"/> + <param name="param_five" type="integer" value="0" label="Trim n bases from high-quality (left) end of each read before alignment (-5)"/> + <param name="param_three" type="integer" value="0" label="Trim n bases from high-quality (right) end of each read before alignment (-3)"/> + <param name="param_k" type="integer" value="1" label="Report up to n valid alignments per read (-k)"/> + <param name="param_a" type="boolean" checked="False" truevalue = "1" falsevalue = "0" label="Whether or not to report all valid alignments per read (-a)"/> + <param name="param_m" type="integer" value="-1" label="Suppress all alignments for a read if more than n reportable alignments exist (-m), -1 for unlimited"/> + <param name="param_best" type="boolean" checked="False" truevalue = "1" falsevalue = "0" label="Whether or not to make Bowtie guarantee that reported singleton alignments are 'best' in terms of stratum and in terms of the quality values at the mismatched positions (--best --strata)"/> + </when> + </conditional> + + </inputs> + <outputs> + <data name="output" type="data" format="bam"/> + </outputs> + + <help> + + +**TIPS**: + +----- + +**Input**: + +* 1. Sequence file type (FASTA/FASTQ) +* 2. Sequence file (fasta/fastq format) {Default: fastq file} +* 3. Reference file (e.g. cDNA library [fasta]) +* 4. “Shift” (The length of the sequence that will be trimmed at the 3’end of the reads before each round of mapping) +* 5. “Length” (The minimum length of the reads for mapping after trimming) +* [Optional] +* 1. Bowtie mapping flags (options) [Default: -v 0 -a --best --strata] (-v flag indicates the number of allowed mismatches. use -5/-3 flag to trim nucleotides from 5'/3' end of the reads) + +----- + +**Output**: + +A bam file with all of the reads that are mapped + + + + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Iterative_mapping/map_ex.py Mon Sep 15 14:52:20 2014 -0400 @@ -0,0 +1,31 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys +from read_file import * +from Bio import SeqIO + +map_file = sys.argv[1] +result_file = sys.argv[2] + + +#reads = read_t_file(read_file); + +f = open(map_file); +h = file(result_file, 'w') + +for aline in f.readlines(): + tline = aline.strip(); + tl = tline.split('\t'); + if len(tl)>4: + if int(tl[1].strip())== 0: + h.write(tline) + h.write('\n') + + +f.close(); +h.close() + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Iterative_mapping/read_file.py Mon Sep 15 14:52:20 2014 -0400 @@ -0,0 +1,21 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys + + + +def read_t_file(in_file): + f = open(in_file); + result = []; + for aline in f.readlines(): + temp = []; + tline = aline.strip(); + tl = tline.split('\t'); + for i in range(0, len(tl)): + temp.append(tl[i].strip()); + result.append(temp); + f.close(); + return result; + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Iterative_mapping/read_s_file.py Mon Sep 15 14:52:20 2014 -0400 @@ -0,0 +1,22 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys + + + +def read_sp_file(in_file): + f = open(in_file); + result = []; + for aline in f.readlines(): + temp = []; + tline = aline.strip(); + tl = tline.split(' '); + for i in range(0, len(tl)): + if len(tl[i].strip())>0: + temp.append(tl[i].strip()); + result.append(temp); + f.close(); + return result; + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Iterative_mapping/remove_map.py Mon Sep 15 14:52:20 2014 -0400 @@ -0,0 +1,29 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys +from read_file import * + + +unmap_file = sys.argv[1] +map_file = sys.argv[2] +result_file = sys.argv[3] + + +unmap = read_t_file(unmap_file) +mapped = read_t_file(map_file) +h = file(result_file, 'w') + +maps = set() +for i in range(len(mapped)): + maps.add(mapped[i][0]) + + +for i in range(len(unmap)): + name = unmap[i][0] + if name not in maps: + h.write(name) + h.write('\n') + + +h.close()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Iterative_mapping/seq_track.py Mon Sep 15 14:52:20 2014 -0400 @@ -0,0 +1,38 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys +from read_file import * +from Bio import SeqIO + +unmap_file = sys.argv[1] +reads_file = sys.argv[2] +result_file = sys.argv[3] +tp = sys.argv[4] + + +unmap = read_t_file(unmap_file); + +h = file(result_file, 'w') + +reads = SeqIO.parse(reads_file,tp) +um = set() +for i in range(0, len(unmap)): + id_r = unmap[i][0] + um.add(id_r) + +for read in reads: + if read.id in um: + h.write('>') + h.write(read.id) + h.write('\n') + h.write(read.seq.tostring()) + h.write('\n') + + + +h.close() + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Iterative_mapping/tool_dependencies.xml Mon Sep 15 14:52:20 2014 -0400 @@ -0,0 +1,15 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="biopython" version="1.61"> + <repository changeset_revision="ae9dda584395" name="package_biopython_1_61" owner="biopython" prior_installation_required="True" toolshed="http://toolshed.g2.bx.psu.edu" /> + </package> + <package name="numpy" version="1.7"> + <repository changeset_revision="ef12a3a11d5b" name="package_numpy_1_7" owner="iuc" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" /> + </package> + <package name="samtools" version="0.1.18"> + <repository changeset_revision="171cd8bc208d" name="package_samtools_0_1_18" owner="devteam" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" /> + </package> + <package name="bowtie" version="0.12.7"> + <repository changeset_revision="9f9f38617a98" name="package_bowtie_0_12_7" owner="devteam" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" /> + </package> +</tool_dependency>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Iterative_mapping/truncate.py Mon Sep 15 14:52:20 2014 -0400 @@ -0,0 +1,32 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys +from Bio import SeqIO + +fasta_file = sys.argv[1] +shift_in = sys.argv[2] +result_file = sys.argv[3] +length = sys.argv[4] + +shift = int(shift_in) + +fasta_sequences = SeqIO.parse(open(fasta_file),'fasta'); +h = file(result_file,'w') +for seq in fasta_sequences: + nuc = seq.id; + sequence = seq.seq.tostring(); + if (len(sequence)-shift)>=int(length): + h.write('>'+nuc) + h.write('\n') + h.write(sequence[0:(len(sequence)-shift)]) + h.write('\n') + + + + +h.close() + + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Iterative_mapping/unmap.py Mon Sep 15 14:52:20 2014 -0400 @@ -0,0 +1,31 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys +from read_file import * +from Bio import SeqIO + +map_file = sys.argv[1] +result_file = sys.argv[2] + + +#reads = read_t_file(read_file); + +f = open(map_file); +h = file(result_file, 'w') + +for aline in f.readlines(): + tline = aline.strip(); + tl = tline.split('\t'); + if len(tl)>4: + if int(tl[1].strip()) != 0: + h.write(tl[0].strip()); + h.write('\n'); + + +f.close(); +h.close() + + + +
--- a/get_reads/get_read.py Mon Sep 15 14:47:42 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,77 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import sys -#from galaxy.tools.read_file import * -from Bio import SeqIO -import os -from read_file import * - -fasta_file = sys.argv[1] -map_file = sys.argv[2] -result_file = sys.argv[3] - -os.system("samtools view -F 0xfff "+map_file+"|cut -f 3,4 > map_info.txt") - -fasta_sequences = SeqIO.parse(open(fasta_file),'fasta'); -length_seq = {}; -for seq in fasta_sequences: - nuc = seq.id; - length_seq[nuc] = len(seq.seq.tostring()); - - - -mapping = {} -transcripts = [] - -f = open("map_info.txt"); -for aline in f.readlines(): - tline = aline.strip(); - tl = tline.split('\t'); - if tl[0].strip() not in transcripts: - transcripts.append(tl[0].strip()); - mapping[tl[0].strip()] = []; - - mapping[tl[0].strip()].append(tl[1].strip()); - -distribution = {}; -coverage = {}; -for transcript in length_seq: - distribution[transcript] = []; - for i in range(0, length_seq[transcript]): - distribution[transcript].append(0); - sum_count = float(0); - if transcript in mapping: - for j in range(0, len(mapping[transcript])): - index = mapping[transcript][j]; - #count = reads[mapping[transcript][j][0]]; - sum_count = sum_count + 1; - distribution[transcript][int(index)-1] = distribution[transcript][int(index)-1] + 1; - coverage[transcript] = float(sum_count)/float(length_seq[transcript]); - else: - coverage[transcript] = 0 - - - - - -h = file(result_file, 'w') -for transcript in length_seq: - h.write(transcript); - h.write('\n') - for i in range(0, length_seq[transcript]): - h.write(str(distribution[transcript][i])) - h.write('\t') - h.write('\n') - h.write('\n') - - - - - -f.close(); -h.close() - - - -
--- a/get_reads/get_read.xml Mon Sep 15 14:47:42 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,43 +0,0 @@ -<tool id="get_read_pipeline" name="Get RT stop counts" version="1.0"> - <description></description> - <command interpreter="python">get_read.py $lib_file $map_file $output </command> - <requirements> - <requirement type="package" version="1.61">biopython</requirement> - <requirement type="package" version="0.1.18">samtools</requirement> - </requirements> - <inputs> - <param name="lib_file" type="data" format="fasta" label="Library file (fasta)"/> - <param name="map_file" type="data" format="bam" label="Mapped file"/> - </inputs> - <outputs> - <data name="output" format="txt"/> - </outputs> - <tests> - <test> - <param name="lib_file" value="test.bam" /> - <param name="map_file" value="com_rna.txt" /> - <output name="output" file="get_RT_stop_test.out" /> - - </test> - </tests> - - <help> - - -**TIPS**: - ------ - -**Input** -1. A mapped (bam) file from Bowtie (or any mapping program) -2. Reference library sequences (fasta) used to map the reads - ------ - -**Output**: -A text file with reverse transcription stop counts mapped to each nucleotide (RTSC file) - - - - </help> -</tool>
--- a/get_reads/read_file.py Mon Sep 15 14:47:42 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,21 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import sys - - - -def read_t_file(in_file): - f = open(in_file); - result = []; - for aline in f.readlines(): - temp = []; - tline = aline.strip(); - tl = tline.split('\t'); - for i in range(0, len(tl)): - temp.append(tl[i].strip()); - result.append(temp); - f.close(); - return result; - -
--- a/get_reads/tool_dependencies.xml Mon Sep 15 14:47:42 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,12 +0,0 @@ -<?xml version="1.0"?> -<tool_dependency> - <package name="biopython" version="1.61"> - <repository changeset_revision="ae9dda584395" name="package_biopython_1_61" owner="biopython" prior_installation_required="True" toolshed="http://toolshed.g2.bx.psu.edu" /> - </package> - <package name="numpy" version="1.7"> - <repository changeset_revision="ef12a3a11d5b" name="package_numpy_1_7" owner="iuc" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" /> - </package> - <package name="samtools" version="0.1.18"> - <repository changeset_revision="171cd8bc208d" name="package_samtools_0_1_18" owner="devteam" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" /> - </package> -</tool_dependency>