Mercurial > repos > triasteran > ribogalaxy_umi_processing
changeset 4:a580e700aac3 draft
Uploaded
author | triasteran |
---|---|
date | Tue, 21 Jun 2022 08:32:44 +0000 |
parents | d27375bc4a1c |
children | e370df93715d |
files | UMI_riboseq_processing/UMI.py UMI_riboseq_processing/UMI_riboseq.xml |
diffstat | 2 files changed, 26 insertions(+), 26 deletions(-) [+] |
line wrap: on
line diff
--- a/UMI_riboseq_processing/UMI.py Mon Jun 20 08:02:35 2022 +0000 +++ b/UMI_riboseq_processing/UMI.py Tue Jun 21 08:32:44 2022 +0000 @@ -1,48 +1,46 @@ -import itertools +import gzip +from mimetypes import guess_type +from functools import partial +from Bio import SeqIO from sys import argv, exit -from itertools import zip_longest - -def grouper(iterable, n, fillvalue=None): - args = [iter(iterable)] * n - return zip_longest(*args, fillvalue=fillvalue) - -chunk_size=4 - - -def trimandpaste(pathToFastaFile, output): - #filename = pathToFastaFile.split('/')[-1] - output = open(output,"w") - with open(pathToFastaFile) as f: - for lines in grouper(f, chunk_size, ""): #for every chunk_sized chunk +def copy_UMI_to_header_and_output_trimmed_read(pathToFastaFile, output): + # find wheather its plain or gzipped fastq + encoding = guess_type(pathToFastaFile)[1] # uses file extension + _open = partial(gzip.open, mode='rt') if encoding == 'gzip' else open + # output file will be in gz format + output = gzip.open(output,"wt") + # open and parse + with _open(pathToFastaFile) as f: + for record in SeqIO.parse(f, 'fastq'): + lines = record.format('fastq').split('\n') # list of each record: id, seq, '+', quality header = lines[0] seq = lines[1] sep = lines[2] qual = lines[3] trimmed_seq = seq[2:-6]+"\n" # fooprint + barcode - UMI = seq[0:2]+seq.rstrip()[-5:] #7nt in total; 5'NN and last 3'NNNNN + UMI = seq[0:2]+seq.rstrip()[-5:] #7nt in total; 5'NN and last 3'NNNNN split_header = header.split(" ") new_header = split_header[0]+"_"+UMI+" "+split_header[1] if qual[-1:] == "\n": new_qual = qual[2:-6]+"\n" else: new_qual = qual[2:-6] - output.write(new_header) - output.write(trimmed_seq) - output.write(sep) - output.write(new_qual) + output.write(new_header+'\n') + output.write(trimmed_seq) + output.write(sep+'\n') + output.write(new_qual+'\n') - output.close() + output.close() def main(): - if len(argv) != 3: + if len(argv) != 3: exit("Usage: 2 arguments required\n1: Path to fasta file \n2: name of output file") # Get paths pathToFastaFile = argv[1] output = argv[2] - - trimandpaste(pathToFastaFile, output) + copy_UMI_to_header_and_output_trimmed_read(pathToFastaFile, output) if __name__ == "__main__": main()
--- a/UMI_riboseq_processing/UMI_riboseq.xml Mon Jun 20 08:02:35 2022 +0000 +++ b/UMI_riboseq_processing/UMI_riboseq.xml Tue Jun 21 08:32:44 2022 +0000 @@ -1,5 +1,6 @@ -<tool id="UMI_riboseq" name="move UMIs from reads to header" version="0.1.3"> +<tool id="UMI_riboseq" name="move UMIs from reads to header" version="0.1.4"> <requirements> + <requirement type="package" version="1.75">biopython</requirement> </requirements> <command detect_errors="exit_code"> <![CDATA[ python3 '$__tool_directory__/UMI.py' $reads $output ]]> @@ -17,7 +18,8 @@ </test> </tests> <help> -<![CDATA[ **fastq/fastq.gz** input files with reads containing both UMIs and barcodes but NO adapters. ----- **Output** The output of the script is fastq/fastq.gz file where UMIs (7nt in total, 5'NN and 3'NNNNN preceding barcode consisting of 5nt) are in header of the read; next step of processing is demultiplexing]]> +<![CDATA[ fastq/fastq.gz are input files with reads containing UMIs (already demultiplexed and adapters are removed). + The output of the script is fastq/fastq.gz file where UMIs (7nt in total, 5'NN and 3'NNNNN preceding barcode consisting of 5nt) are in header of the read. It is designed for protocol: McGlincy NJ, Ingolia NT. Transcriptome-wide measurement of translation by ribosome profiling. Methods. 2017;126:112-129. doi:10.1016/j.ymeth.2017.05.028 ]]> </help> <citations> <citation type="bibtex"> @misc{FedorovaAD2022, author = {Fedorova Alla}, year = {2022}, title = {UMI_for_riboseq}, publisher = {GitHub}, journal = {GitHub repository}, url = {https://github.com/triasteran/RiboGalaxy_with_ansible/blob/main/toolshed_tools/UMI_riboseq_tool/UMI.py}, }