Mercurial > repos > dcouvin > mirureader
comparison MIRUReader/MIRUReader.py @ 0:f0e3646a4e45 draft
Uploaded
| author | dcouvin |
|---|---|
| date | Tue, 17 Aug 2021 19:15:15 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:f0e3646a4e45 |
|---|---|
| 1 #!/usr/bin/python3 | |
| 2 | |
| 3 #Copyright 2019 NUS pathogen genomics | |
| 4 #Written by Cheng Yee Tang (chengyee.tang@nus.edu.sg) | |
| 5 #former python tag !/usr/bin/env python | |
| 6 | |
| 7 import os | |
| 8 import sys | |
| 9 import gzip | |
| 10 import argparse | |
| 11 import pandas as pd | |
| 12 import statistics | |
| 13 import subprocess | |
| 14 from statistics import mode | |
| 15 from collections import Counter | |
| 16 | |
| 17 | |
| 18 #function to determine repeat number based on total number of mismatches in primer sequence | |
| 19 def chooseMode(name, table, CounterList): | |
| 20 maxcount = max(CounterList.values()) | |
| 21 repeatToCheck = [] | |
| 22 for k, v in CounterList.items(): | |
| 23 if v == maxcount: | |
| 24 repeatToCheck.append(k) | |
| 25 x = 0 | |
| 26 for i, j in table.items(): | |
| 27 if name in i: | |
| 28 x += 1 | |
| 29 mismatchDict = {} | |
| 30 for rp in repeatToCheck: | |
| 31 mismatchDict[rp] = 0 | |
| 32 for i in range(x): | |
| 33 string = name + '_' + str(i+1) | |
| 34 if table[string][1] in repeatToCheck: | |
| 35 mismatchDict[table[string][1]] += table[string][0] | |
| 36 checklist2 = [] | |
| 37 for m, n in mismatchDict.items(): | |
| 38 checklist2.append(n) | |
| 39 duplicates = '' | |
| 40 for item in checklist2: | |
| 41 if checklist2.count(item) > 1: | |
| 42 duplicates = 'yes' | |
| 43 finalMode = '' | |
| 44 if duplicates == 'yes': | |
| 45 finalMode = '/'.join(str(r) for min_value in (min(mismatchDict.values()),) for r in mismatchDict if mismatchDict[r]==min_value) | |
| 46 else: | |
| 47 finalMode = min(mismatchDict.keys(), key=(lambda k: mismatchDict[k])) | |
| 48 return finalMode | |
| 49 | |
| 50 ''' | |
| 51 Main function | |
| 52 ''' | |
| 53 | |
| 54 script_dir = os.path.dirname(os.path.abspath(sys.argv[0])) | |
| 55 MIRU_table = script_dir + "/MIRU_table" | |
| 56 MIRU_table_0580 = script_dir + "/MIRU_table_0580" | |
| 57 MIRU_primers = script_dir + "/MIRU_primers" | |
| 58 | |
| 59 parser = argparse.ArgumentParser() | |
| 60 main_group = parser.add_argument_group('Main options') | |
| 61 main_group.add_argument('-r', '--reads', required=True, help='input reads file in fastq/fasta format (required)') | |
| 62 main_group.add_argument('-p', '--prefix', required=True, help='sample ID (required)') | |
| 63 main_group.add_argument('--table', type=str, default=MIRU_table, help='allele calling table') | |
| 64 main_group.add_argument('--primers', type=str, default=MIRU_primers, help='primers sequences') | |
| 65 optional_group = parser.add_argument_group('Optional options') | |
| 66 optional_group.add_argument('--amplicons', help='provide output from primersearch and summarize MIRU profile directly', action='store_true') | |
| 67 optional_group.add_argument('--details', help='for inspection', action='store_true') | |
| 68 optional_group.add_argument('--nofasta', help='delete the fasta reads file generated if your reads are in fastq format', action='store_true') | |
| 69 args = parser.parse_args() | |
| 70 | |
| 71 if not os.path.exists(args.reads): | |
| 72 sys.exit('Error: ' + args.reads + ' is not found!') | |
| 73 | |
| 74 sample_prefix = args.prefix | |
| 75 sample_dir = os.path.dirname(os.path.abspath(args.reads)) | |
| 76 mismatch_allowed = 18 | |
| 77 psearchOut = sample_dir + '/' + sample_prefix + '.' + str(mismatch_allowed) + '.primersearch.out' | |
| 78 | |
| 79 df = pd.read_csv(MIRU_table, sep='\t') | |
| 80 df_0580 = pd.read_csv(MIRU_table_0580, sep='\t') | |
| 81 miru = [] | |
| 82 with open(args.primers) as primerFile: | |
| 83 for line in primerFile: | |
| 84 miru.append(line.split()[0]) | |
| 85 | |
| 86 #auto detect .fastq, .fastq.gz, .fasta, .fasta.gz | |
| 87 #convert fastq to fasta | |
| 88 | |
| 89 fastaReads = sample_dir + '/' + sample_prefix + '.fasta' | |
| 90 if not args.amplicons: | |
| 91 if '.fastq' in args.reads: | |
| 92 if '.gz' in args.reads: | |
| 93 tmpH = open(fastaReads, 'w') | |
| 94 p1 = subprocess.Popen(['zcat', args.reads], stdout=subprocess.PIPE) | |
| 95 subprocess_args1 = ['sed', '-n', '1~4s/^@/>/p;2~4p'] | |
| 96 subprocess.call(subprocess_args1, stdin=p1.stdout, stdout=tmpH) | |
| 97 tmpH.close() | |
| 98 else: | |
| 99 tmpH = open(fastaReads, 'w') | |
| 100 subprocess_args1 = ['sed', '-n', '1~4s/^@/>/p;2~4p', args.reads] | |
| 101 subprocess.call(subprocess_args1, stdout=tmpH) | |
| 102 tmpH.close() | |
| 103 elif '.fasta' in args.reads: | |
| 104 if '.gz' in args.reads: | |
| 105 with open(fastaReads, 'w') as f: | |
| 106 for line in gzip.open(args.reads, 'rb').readlines(): | |
| 107 f.write(line) | |
| 108 else: | |
| 109 fastaReads = args.reads | |
| 110 | |
| 111 if not args.amplicons: | |
| 112 try: | |
| 113 subprocess_args = ['primersearch', '-seqall', fastaReads, '-infile', args.primers, '-mismatchpercent', str(mismatch_allowed), '-outfile', psearchOut] | |
| 114 subprocess.call(subprocess_args) | |
| 115 except OSError: | |
| 116 print('OSError: primersearch command is not found.') | |
| 117 sys.exit() | |
| 118 | |
| 119 if not os.path.exists(psearchOut): | |
| 120 sys.exit('Error: ' + psearchOut + ' is not found!') | |
| 121 | |
| 122 lookup = {} | |
| 123 repeats = {} | |
| 124 with open(psearchOut, 'r') as infile: | |
| 125 for line in infile.read().splitlines(): | |
| 126 if line.startswith('Primer'): | |
| 127 col = line.split(' ') | |
| 128 loci = str(col[2]) | |
| 129 repeats.setdefault(loci, []) | |
| 130 elif (line.startswith('Amplimer') and len(line) < 12): | |
| 131 col = line.split(' ') | |
| 132 primerID = loci + '_' + str(col[1]) | |
| 133 lookup.setdefault(primerID, []) | |
| 134 mm = 0 | |
| 135 elif 'mismatches' in line: | |
| 136 mm += int(line.partition('with ')[2].rstrip(' mismatches')) | |
| 137 elif 'Amplimer length' in line: | |
| 138 field = line.split(':') | |
| 139 amplicon = int(field[1].strip(' ').rstrip(' bp')) | |
| 140 lookup.setdefault(primerID).append(mm) | |
| 141 if amplicon > 1828: | |
| 142 lookup.setdefault(primerID).append('NA') | |
| 143 elif loci == '0580': | |
| 144 if amplicon > df_0580[loci][25]: | |
| 145 lookup.setdefault(primerID).append('NA') | |
| 146 else: | |
| 147 for i in range(26): | |
| 148 if amplicon < df_0580[loci][i]: | |
| 149 if i != 0: | |
| 150 first = df_0580[loci][i-1] | |
| 151 second = df_0580[loci][i] | |
| 152 if abs(amplicon - first) > abs(amplicon - second): | |
| 153 repeats.setdefault(loci).append(df_0580['No.'][i]) | |
| 154 lookup.setdefault(primerID).append(df_0580['No.'][i]) | |
| 155 break | |
| 156 else: | |
| 157 repeats.setdefault(loci).append(df_0580['No.'][i-1]) | |
| 158 lookup.setdefault(primerID).append(df_0580['No.'][i-1]) | |
| 159 break | |
| 160 else: | |
| 161 repeats.setdefault(loci).append(0) | |
| 162 lookup.setdefault(primerID).append(0) | |
| 163 | |
| 164 else: | |
| 165 if amplicon > df[loci][15]: | |
| 166 lookup.setdefault(primerID).append('NA') | |
| 167 else: | |
| 168 for i in range(16): | |
| 169 if amplicon < df[loci][i]: | |
| 170 if i != 0: | |
| 171 first = df[loci][i-1] | |
| 172 second = df[loci][i] | |
| 173 if abs(amplicon - first) > abs(amplicon - second): | |
| 174 repeats.setdefault(loci).append(i) | |
| 175 lookup.setdefault(primerID).append(i) | |
| 176 break | |
| 177 else: | |
| 178 repeats.setdefault(loci).append(i-1) | |
| 179 lookup.setdefault(primerID).append(i-1) | |
| 180 break | |
| 181 else: | |
| 182 repeats.setdefault(loci).append(0) | |
| 183 lookup.setdefault(primerID).append(0) | |
| 184 | |
| 185 if args.details: | |
| 186 myLookUp = pd.DataFrame(columns=["loci", "hit_index", "repeat_no", "error_no"]) | |
| 187 for key, value in lookup.items(): | |
| 188 #example: lookup = {'0154_1':[2,4]} total no. of mismatches, repeat number | |
| 189 myLookUp = myLookUp.append({"loci":key.split("_")[0], "hit_index":int(key.split("_")[1]), "repeat_no":lookup[key][1], "error_no":lookup[key][0]}, ignore_index=True) | |
| 190 sortedLookUp = myLookUp.sort_values(by=["loci", "hit_index"]) | |
| 191 print(sortedLookUp.to_csv(sep='\t', index=False)) | |
| 192 for item in miru: | |
| 193 #array that used to determine repeat number | |
| 194 print(Counter(repeats[item])) | |
| 195 | |
| 196 miru_repeats = pd.DataFrame(columns = ['sample_prefix'] + miru, index = range(1)) | |
| 197 miru_repeats['sample_prefix'] = sample_prefix | |
| 198 for item in miru: | |
| 199 if repeats[item] != []: | |
| 200 try: | |
| 201 repeat = mode(repeats[item]) | |
| 202 miru_repeats[item][0] = repeat | |
| 203 except statistics.StatisticsError: | |
| 204 repeat = chooseMode(item, lookup, Counter(repeats[item])) | |
| 205 miru_repeats[item][0] = repeat | |
| 206 else: | |
| 207 miru_repeats[item][0] = "ND" | |
| 208 | |
| 209 if args.nofasta: | |
| 210 if ('.fastq' in args.reads) or ('.gz' in args.reads): | |
| 211 os.remove(fastaReads) | |
| 212 | |
| 213 print(miru_repeats.to_csv(sep='\t', index=False, header=True)) |
