Mercurial > repos > boris > getalleleseq
comparison getalleleseq.py @ 8:698ede7baba9 draft
Uploaded
| author | boris |
|---|---|
| date | Tue, 18 Mar 2014 12:25:24 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 7:654b9e711967 | 8:698ede7baba9 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu) | |
| 3 # | |
| 4 #usage: getalleleseq.py [-h] [-l INT] [-j FILE] [-d DIR] alleles | |
| 5 # | |
| 6 #Given a table with minor and major alleles per position, it generates the | |
| 7 #minor and major allele sequences in FASTA format | |
| 8 # | |
| 9 #positional arguments: | |
| 10 # alleles Table containing minor and major allele base per | |
| 11 # position. cols: [id, chr, pos, A, C, G, T, cvrg, | |
| 12 # plody, major, minor, freq_minor] | |
| 13 # | |
| 14 #optional arguments: | |
| 15 # -h, --help show this help message and exit | |
| 16 # -l INT, --seq-length INT | |
| 17 # Background sequence length. Bases in an artifical | |
| 18 # all-N-sequence of length INT will be replaced by | |
| 19 # either the major or minor allele base accordingly | |
| 20 # -j FILE, --major-seq FILE | |
| 21 # File to write major allele sequences in FASTA multiple | |
| 22 # alignment format. | |
| 23 # -d DIR, --minor-dir DIR | |
| 24 # Per sample minor allele sequences will be written to | |
| 25 # this directory | |
| 26 # | |
| 27 # The expected columns in the alleles table follow Nicholas Stoler's | |
| 28 # Variant Annotator tool format. See Variant Annotator in Galaxy's tool shed | |
| 29 # http://testtoolshed.g2.bx.psu.edu/repos/nick/allele_counts_1 for more details | |
| 30 # | |
| 31 # Expected columns: | |
| 32 # 1. sample_id | |
| 33 # 2. chr | |
| 34 # 3. position | |
| 35 # 4 counts for A's | |
| 36 # 5. counts for C's | |
| 37 # 6. counts for G's | |
| 38 # 7. counts for T's | |
| 39 # (8. counts for a's) | |
| 40 # (9. counts for c's) | |
| 41 # (10. counts for g's) | |
| 42 # (11. counts for t's) | |
| 43 # 8. (12.) Coverage | |
| 44 # 9. (13.) Number of alleles passing a given criteria | |
| 45 # 10. (14.) Major allele | |
| 46 # 11. (15.) Minor allele | |
| 47 # 12. (16.) Minor allele frequency in position | |
| 48 | |
| 49 import sys | |
| 50 import os | |
| 51 import argparse | |
| 52 | |
| 53 def createseq(sample, allele, seq_size, table): | |
| 54 """Generate major or minor allele sequence""" | |
| 55 out_sequence = ['N' for i in range(seq_size)] | |
| 56 sample_data = [line for line in table if line[0] == sample] | |
| 57 | |
| 58 for entry in sample_data: | |
| 59 position = int(entry[2]) | |
| 60 if len(entry)==12: | |
| 61 number_of_alleles = int(entry[8]) | |
| 62 major_allele = entry[9].strip() | |
| 63 minor_allele = entry[10].strip() | |
| 64 else: | |
| 65 number_of_alleles = int(entry[12]) | |
| 66 major_allele = entry[13].strip() | |
| 67 minor_allele = entry[14].strip() | |
| 68 | |
| 69 if allele == 'major': | |
| 70 out_sequence[position-1] = major_allele | |
| 71 elif allele == 'minor': | |
| 72 if number_of_alleles >= 2: | |
| 73 out_sequence[position-1] = minor_allele | |
| 74 else: | |
| 75 out_sequence[position-1] = major_allele | |
| 76 return out_sequence | |
| 77 | |
| 78 def printseq(sample,allele,seq,output): | |
| 79 """Print out sequence""" | |
| 80 #print >> output, '>{0}_{1}'.format(sample,allele) | |
| 81 print >> output, '>{0}{1}'.format(sample,allele) | |
| 82 for i in range(0,len(seq),70): | |
| 83 print >> output, ''.join(seq[i:i+70]) | |
| 84 | |
| 85 def main(): | |
| 86 parser = argparse.ArgumentParser(description='Given a table with minor and major alleles per position, it generates the minor and major allele sequences in FASTA format', epilog='Boris Rebolledo-Jaramillo (boris-at-bx.psu.edu)') | |
| 87 parser.add_argument('alleles', type=str, help='Table containing minor and major allele base per position. cols: [id, chr, pos, A, C, G, T, cvrg, plody, major, minor, freq_minor] ') | |
| 88 parser.add_argument('-l','--seq-length', type=int, metavar='INT', help='Background sequence length. Bases in an artifical all-N-sequence of length INT will be replaced by either the major or minor allele base accordingly') | |
| 89 parser.add_argument('-j','--major-seq', type=str, metavar='FILE', help='File to write major allele sequences in FASTA multiple alignment format.') | |
| 90 parser.add_argument('-d', '--minor-dir', type=str, metavar='DIR', default='.', help="Per sample minor allele sequences will be written to this directory (Default: current directory)") | |
| 91 parser.add_argument('-p', '--minor-prefix', type=str, metavar='STR', nargs='?', const='', default='', help=argparse.SUPPRESS) #Galaxy compatibility | |
| 92 args = parser.parse_args() | |
| 93 | |
| 94 | |
| 95 try: | |
| 96 table = [line.strip().split('\t') for line in list(open(args.alleles)) if "#" not in line] | |
| 97 samples = sorted(list(set([ line[0] for line in table ]))) | |
| 98 except: | |
| 99 sys.exit('\nERROR: Could not open %s\n' % args.alleles) | |
| 100 try: | |
| 101 major_out = open(args.major_seq, 'w+') | |
| 102 except: | |
| 103 sys.exit('\nCould not create %s\n' % args.major_seq) | |
| 104 | |
| 105 # Single file for all major allele sequences in FASTA multiple alignment | |
| 106 for sample in samples: | |
| 107 sequence = createseq(sample,'major',args.seq_length,table) | |
| 108 #printseq(sample,'major',sequence,major_out) | |
| 109 printseq(sample,'',sequence,major_out) | |
| 110 major_out.close() | |
| 111 | |
| 112 # Sample specific minor allele sequence in FASTA format | |
| 113 try: | |
| 114 os.makedirs(args.minor_dir) | |
| 115 except: | |
| 116 pass | |
| 117 | |
| 118 for sample in samples: | |
| 119 if args.minor_prefix: # to fit Galaxy requirements | |
| 120 name = sample.replace('_','') | |
| 121 minor_name = "%s_%s_%s" % ('primary',args.minor_prefix,name+'-minor_visible_fasta') | |
| 122 else: # for non-Galaxy | |
| 123 minor_name = sample+'-minor.fa' | |
| 124 minor_out = open(os.path.join(args.minor_dir, minor_name), 'w+') | |
| 125 sequence = createseq(sample,'minor',args.seq_length,table) | |
| 126 #printseq(sample,'minor',sequence,minor_out) | |
| 127 printseq(sample,'_minor',sequence,minor_out) | |
| 128 minor_out.close() | |
| 129 | |
| 130 if __name__ == "__main__": main() |
