# HG changeset patch # User triasteran # Date 1655800364 0 # Node ID a580e700aac32dafe08bec5be51a14dc80ca139b # Parent d27375bc4a1cc4a5dc4f8022192408fe50909716 Uploaded diff -r d27375bc4a1c -r a580e700aac3 UMI_riboseq_processing/UMI.py --- a/UMI_riboseq_processing/UMI.py Mon Jun 20 08:02:35 2022 +0000 +++ b/UMI_riboseq_processing/UMI.py Tue Jun 21 08:32:44 2022 +0000 @@ -1,48 +1,46 @@ -import itertools +import gzip +from mimetypes import guess_type +from functools import partial +from Bio import SeqIO from sys import argv, exit -from itertools import zip_longest - -def grouper(iterable, n, fillvalue=None): - args = [iter(iterable)] * n - return zip_longest(*args, fillvalue=fillvalue) - -chunk_size=4 - - -def trimandpaste(pathToFastaFile, output): - #filename = pathToFastaFile.split('/')[-1] - output = open(output,"w") - with open(pathToFastaFile) as f: - for lines in grouper(f, chunk_size, ""): #for every chunk_sized chunk +def copy_UMI_to_header_and_output_trimmed_read(pathToFastaFile, output): + # find wheather its plain or gzipped fastq + encoding = guess_type(pathToFastaFile)[1] # uses file extension + _open = partial(gzip.open, mode='rt') if encoding == 'gzip' else open + # output file will be in gz format + output = gzip.open(output,"wt") + # open and parse + with _open(pathToFastaFile) as f: + for record in SeqIO.parse(f, 'fastq'): + lines = record.format('fastq').split('\n') # list of each record: id, seq, '+', quality header = lines[0] seq = lines[1] sep = lines[2] qual = lines[3] trimmed_seq = seq[2:-6]+"\n" # fooprint + barcode - UMI = seq[0:2]+seq.rstrip()[-5:] #7nt in total; 5'NN and last 3'NNNNN + UMI = seq[0:2]+seq.rstrip()[-5:] #7nt in total; 5'NN and last 3'NNNNN split_header = header.split(" ") new_header = split_header[0]+"_"+UMI+" "+split_header[1] if qual[-1:] == "\n": new_qual = qual[2:-6]+"\n" else: new_qual = qual[2:-6] - output.write(new_header) - output.write(trimmed_seq) - output.write(sep) - output.write(new_qual) + output.write(new_header+'\n') + output.write(trimmed_seq) + output.write(sep+'\n') + output.write(new_qual+'\n') - output.close() + output.close() def main(): - if len(argv) != 3: + if len(argv) != 3: exit("Usage: 2 arguments required\n1: Path to fasta file \n2: name of output file") # Get paths pathToFastaFile = argv[1] output = argv[2] - - trimandpaste(pathToFastaFile, output) + copy_UMI_to_header_and_output_trimmed_read(pathToFastaFile, output) if __name__ == "__main__": main() diff -r d27375bc4a1c -r a580e700aac3 UMI_riboseq_processing/UMI_riboseq.xml --- a/UMI_riboseq_processing/UMI_riboseq.xml Mon Jun 20 08:02:35 2022 +0000 +++ b/UMI_riboseq_processing/UMI_riboseq.xml Tue Jun 21 08:32:44 2022 +0000 @@ -1,5 +1,6 @@ - + + biopython @@ -17,7 +18,8 @@ - + @misc{FedorovaAD2022, author = {Fedorova Alla}, year = {2022}, title = {UMI_for_riboseq}, publisher = {GitHub}, journal = {GitHub repository}, url = {https://github.com/triasteran/RiboGalaxy_with_ansible/blob/main/toolshed_tools/UMI_riboseq_tool/UMI.py}, }