# HG changeset patch # User triasteran # Date 1655817728 0 # Node ID 701804f5ad4b3335cdf4fe2f356fbf93b6beb2d1 # Parent be394fb4725069b085f58c3c537e4034a6362bfe Uploaded diff -r be394fb47250 -r 701804f5ad4b UMI_riboseq_processing/UMI.py --- a/UMI_riboseq_processing/UMI.py Tue Jun 21 09:50:49 2022 +0000 +++ b/UMI_riboseq_processing/UMI.py Tue Jun 21 13:22:08 2022 +0000 @@ -4,48 +4,56 @@ from sys import argv, exit import itertools from itertools import zip_longest - +import subprocess +from subprocess import call def grouper(iterable, n, fillvalue=None): args = [iter(iterable)] * n return zip_longest(*args, fillvalue=fillvalue) -chunk_size=4 +def is_gz_file(filepath): + with open(filepath, 'rb') as test_f: + return test_f.read(2) == b'\x1f\x8b' + -def copy_UMI_to_header_and_output_trimmed_read(pathToFastaFile, output): - # find wheather its plain or gzipped fastq - encoding = guess_type(pathToFastaFile)[1] # uses file extension - _open = partial(gzip.open, mode='rt') if encoding == 'gzip' else open - # output file will be in gz format - output = gzip.open(output,"wt") - # open and parse - with _open(pathToFastaFile) as f: - for lines in grouper(f, chunk_size, ""): - #lines = record.format('fastq').split('\n') # list of each record: id, seq, '+', quality - header = lines[0] - seq = lines[1] - sep = lines[2] - qual = lines[3] - # check if header is OK - if (header.startswith('@')): - trimmed_seq = seq[2:-6]+"\n" # fooprint + barcode - UMI = seq[0:2]+seq.rstrip()[-5:] #7nt in total; 5'NN and last 3'NNNNN - split_header = header.split(" ") - print (split_header) - new_header = split_header[0]+"_"+UMI+" "+split_header[1] - if qual[-1:] == "\n": - new_qual = qual[2:-6]+"\n" - else: - new_qual = qual[2:-6] - output.write(new_header) - output.write(trimmed_seq) - output.write(sep) - output.write(new_qual) - +def lines_parse(f, output_path): + output = open(output_path,"w") + for lines in grouper(f, 4, "\n"): + header = lines[0] + #print (header) + seq = lines[1] + sep = lines[2] + qual = lines[3] + # check if header is OK + if (header.startswith('@')): + trimmed_seq = seq[2:-6]+"\n" # fooprint + barcode + UMI = seq[0:2]+seq.rstrip()[-5:] #7nt in total; 5'NN and last 3'NNNNN + split_header = header.split(" ") + new_header = split_header[0]+"_"+UMI+" "+split_header[1] + if qual[-1:] == "\n": + new_qual = qual[2:-6]+"\n" + else: + new_qual = qual[2:-6] + output.write(new_header) + output.write(trimmed_seq) + output.write(sep) + output.write(new_qual) output.close() + +def UMI_processing(pathToFastaFile, output_path): + if is_gz_file(pathToFastaFile) == True: + with gzip.open(pathToFastaFile, 'rb') as file: + f = [x.decode("utf-8") for x in file.readlines()] + + else: + with open(pathToFastaFile, 'r') as file: + f = file.readlines() + + lines_parse(f, output_path) + def main(): if len(argv) != 3: exit("Usage: 2 arguments required\n1: Path to fasta file \n2: name of output file") @@ -53,7 +61,7 @@ # Get paths pathToFastaFile = argv[1] output = argv[2] - copy_UMI_to_header_and_output_trimmed_read(pathToFastaFile, output) + UMI_processing(pathToFastaFile, output) if __name__ == "__main__": main() diff -r be394fb47250 -r 701804f5ad4b UMI_riboseq_processing/UMI_riboseq.xml --- a/UMI_riboseq_processing/UMI_riboseq.xml Tue Jun 21 09:50:49 2022 +0000 +++ b/UMI_riboseq_processing/UMI_riboseq.xml Tue Jun 21 13:22:08 2022 +0000 @@ -1,4 +1,4 @@ - + @@ -15,6 +15,10 @@ + + + +