comparison PEsortedSAM2readprofile.py @ 12:a3113043abb0 draft

Uploaded
author arkarachai-fungtammasan
date Sun, 24 Jul 2016 17:55:50 -0400
parents 07588b899c13
children
comparison
equal deleted inserted replaced
11:48b5f719e36a 12:a3113043abb0
3 import sys 3 import sys
4 from galaxy import eggs 4 from galaxy import eggs
5 import pkg_resources 5 import pkg_resources
6 pkg_resources.require( "bx-python" ) 6 pkg_resources.require( "bx-python" )
7 import bx.seq.twobit 7 import bx.seq.twobit
8 import re
8 9
9 ##output columns: read_name chr prefix_start prefix_end TR_start TR_end suffix_start suffix_end TR_length TR_sequence 10 ##output columns: read_name chr prefix_start prefix_end TR_start TR_end suffix_start suffix_end TR_length TR_sequence
10 11
12 def readlengthinfer(sequence):
13 total_readlength=0
14 countlist=['M','I','S','X','=']
15 not_countlist=['D','P','N','H']
16 matchs=re.findall(r"(\d+)([A-Z=])",sequence)
17 for match in matchs:
18 if match[1] in countlist:
19 total_readlength+=int(match[0])
20 return total_readlength
21
11 samf = open(sys.argv[1],'r') #assumes sam file is sorted by readname 22 samf = open(sys.argv[1],'r') #assumes sam file is sorted by readname
12 seq_path = sys.argv[2] #Path to the reference genome in 2bit format 23 seq_path = sys.argv[2] #Path to the reference genome in 2bit format
13 24
14 ##maxTRlength=int(sys.argv[4]) 25 ##maxTRlength=int(sys.argv[4])
15 ##maxoriginalreadlength=int(sys.argv[5]) 26 ##maxoriginalreadlength=int(sys.argv[5])
60 #if 'XT:A:U' not in read_elems or 'XT:A:U' not in mate_elems: #both read and it's mate need to be mapped uniquely 71 #if 'XT:A:U' not in read_elems or 'XT:A:U' not in mate_elems: #both read and it's mate need to be mapped uniquely
61 # continue 72 # continue
62 read_chr = read_elems[2] 73 read_chr = read_elems[2]
63 read_start = int(read_elems[3]) 74 read_start = int(read_elems[3])
64 read_cigar = read_elems[5] 75 read_cigar = read_elems[5]
65 if len(read_cigar.split('M')) != 2: #we want perfect matches only..cigar= <someInt>M 76 # if len(read_cigar.split('M')) != 2: #we want perfect matches only..cigar= <someInt>M
66 continue 77 # continue
67 read_len = int(read_cigar.split('M')[0]) 78 # read_len = int(read_cigar.split('M')[0])
79 read_len=readlengthinfer(read_cigar)
68 mate_chr = mate_elems[2] 80 mate_chr = mate_elems[2]
69 mate_start = int(mate_elems[3]) 81 mate_start = int(mate_elems[3])
70 mate_cigar = mate_elems[5] 82 mate_cigar = mate_elems[5]
71 if len(mate_cigar.split('M')) != 2: #we want perfect matches only..cigar= <someInt>M 83 # if len(mate_cigar.split('M')) != 2: #we want perfect matches only..cigar= <someInt>M
72 continue 84 # continue
73 mate_len = int(mate_cigar.split('M')[0]) 85 # mate_len = int(mate_cigar.split('M')[0])
86 mate_len=readlengthinfer(mate_cigar)
74 if read_chr != mate_chr: # check that they were mapped to the same chromosome 87 if read_chr != mate_chr: # check that they were mapped to the same chromosome
75 continue 88 continue
76 if abs(read_start - mate_start) > (maxoriginalreadlength+maxTRlength): 89 if abs(read_start - mate_start) > (maxoriginalreadlength+maxTRlength):
77 continue 90 continue
78 if read_start < mate_start: 91 if read_start < mate_start: