Mercurial > repos > rnateam > rbpbench
changeset 1:b022c6591515 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/rbpbench commit f4a6b7942386dd6506275bc0ec6ec842bc58d5b0
author | rnateam |
---|---|
date | Sun, 03 Dec 2023 21:31:37 +0000 |
parents | 7dd2835ce566 |
children | 26c64157456b |
files | batch_table_wrapper.py tool-data/rbp_ids.catrapid.omics.v2.1.human.6plus.loc.sample tool-data/rbp_ids.catrapid.omics.v2.1.human.6plus.txt tool_data_table_conf.xml.sample tool_data_table_conf.xml.test |
diffstat | 5 files changed, 261 insertions(+), 503 deletions(-) [+] |
line wrap: on
line diff
--- a/batch_table_wrapper.py Sun Dec 03 12:51:54 2023 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,242 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import os -import re -import subprocess - - -############################################################################### - -def setup_argument_parser(): - """Setup argparse parser.""" - help_description = """ - Python wrapper for RBPBench Galaxy wrapper to work with collections of - input BED files (i.e. to process them with rbpbench batch). - """ - # Define argument parser. - p = argparse.ArgumentParser(add_help=False, - prog="batch_table_wrapper.py", - description=help_description, - formatter_class=argparse.MetavarTypeHelpFormatter) - - # Required arguments. - p.add_argument("-h", "--help", - action="help", - help="Print help message") - p.add_argument("--table", - dest="in_table", - type=str, - metavar='str', - required=True, - help="Input table file with data ID, method ID, RBP ID and file name (Galaxy element identifier in dataset collection) for each to be processed dataset by rbpbench batch") - p.add_argument("--paths", - dest="in_paths", - type=str, - metavar='str', - nargs='+', - required=True, - help="List of Galaxy BED file paths (--files path1 path2 .. )") - p.add_argument("--ids", - dest="in_ids", - type=str, - metavar='str', - nargs='+', - required=True, - help="List of Galaxy element identifiers, equal to the BED dataset names in the dataset collection (--ids id1 id2 .. )") - p.add_argument("--genome", - dest="in_genome", - type=str, - metavar='str', - required=True, - help="Genomic sequences file (currently supported formats: FASTA)") - p.add_argument("--out", - dest="out_folder", - type=str, - metavar='str', - required=True, - help="Batch results output folder") - # Optional batch arguments. - p.add_argument("--ext", - dest="ext_up_down", - type=str, - metavar='str', - default="0", - help="Up- and downstream extension of --in sites in nucleotides (nt). Set e.g. --ext 30 for 30 nt on both sides, or --ext 20,10 for different up- and downstream extension (default: 0)") - p.add_argument("--motif-db", - dest="motif_db", - type=int, - default=1, - choices=[1, 2, 3], - help="Motif database to use. 1: human RBP motifs full (259 RBPs, 605 motifs, human_v0.1), 2: human RBP motifs full (low frequencies not rounded, human_v0.1_no_round), 3: human RBP motifs eCLIP (107 RBPs, 316 motifs, human_eclip_v0.1) (default: 1)") - p.add_argument("--fimo-nt-freqs", - dest="fimo_nt_freqs", - type=str, - metavar='str', - default=False, - help="Provide FIMO nucleotide frequencies (FIMO option: --bifile) file (default: use internal frequencies file optimized for human transcripts)") - p.add_argument("--fimo-pval", - dest="fimo_pval", - type=float, - metavar='float', - default=0.001, - help="FIMO p-value threshold (FIMO option: --thresh) (default: 0.001)") - p.add_argument("--bed-score-col", - dest="bed_score_col", - type=int, - metavar='int', - default=5, - help="--in BED score column used for p-value calculations. BED score can be e.g. log2 fold change or -log10 p-value of the region (default: 5)") - p.add_argument("--unstranded", - dest="unstranded", - default=False, - action="store_true", - help="Set if --in BED regions are NOT strand-specific, i.e., to look for motifs on both strands of the provided regions. Note that the two strands of a region will still be counted as one region (change with --unstranded-ct) (default: False)") - p.add_argument("--unstranded-ct", - dest="unstranded_ct", - default=False, - action="store_true", - help="Count each --in region twice for RBP hit statistics when --unstranded is enabled. By default, two strands of one region are counted as one region for RBP hit statistics") - return p - - -############################################################################### - -if __name__ == '__main__': - - parser = setup_argument_parser() - args = parser.parse_args() - - assert os.path.exists(args.in_table), "--table file \"%s\" not found" % (args.in_file) - assert os.path.exists(args.in_genome), "--genome file \"%s\" not found" % (args.in_genome) - - c_paths = len(args.in_paths) - c_ids = len(args.in_ids) - assert c_paths == c_ids, "given # paths (--paths) != # ids (--ids) (%i != %i). Please provide one ID for each path" % (c_paths, c_ids) - - """ - Check given paths and IDs. - - """ - - # Paths. - paths_dic = {} - paths_list = [] - for path in args.in_paths: - assert os.path.exists(path), "--paths %s file not found" % (path) - if path not in paths_dic: - paths_dic[path] = 1 - else: - assert False, "--paths %s given > 1. Please provide unique paths" % (path) - paths_list.append(path) - - # IDs - ids_dic = {} - ids_list = [] - for id in args.in_ids: - if id not in ids_dic: - ids_dic[id] = 1 - else: - assert False, "--ids \"%s\" given > 1. Please provide unique element identifiers (dataset names) inside the dataset collection, in order to unambiguously assign element ID to file path" % (id) - ids_list.append(id) - - id2path_dic = {} - for idx, id in enumerate(ids_list): - path = paths_list[idx] - id2path_dic[id] = path - - """ - Read in table. - - Column format: - rbp_id method_id data_id dataset_name - - """ - - comb_ids_dic = {} - id_collect_dic = {} - id_collect_dic["rbp_id"] = [] - id_collect_dic["method_id"] = [] - id_collect_dic["data_id"] = [] - id_collect_dic["set_name"] = [] - id_collect_dic["path"] = [] # Galaxy file path. - - print("Read in --table ... ") - - with open(args.in_table) as f: - for line in f: - - if re.search("^#", line): - continue - - cols = line.strip().split("\t") - - assert len(cols) == 4, "line in --table with # cols != 4 (%i) encountered:%s" % (len(cols), line) - - rbp_id = cols[0] - method_id = cols[1] - data_id = cols[2] - set_name = cols[3] - - if rbp_id == "rbp_id": - continue - - comb_id = "%s,%s,%s,%s" % (rbp_id, method_id, data_id, set_name) - - if comb_id not in comb_ids_dic: - comb_ids_dic[comb_id] = 1 - else: - assert False, "data combination (\"%s\") appears > 1 in --table file. Please provide unique combinations for rbpbench batch calculation" % (comb_id) - - assert set_name in ids_dic, "given dataset name \"%s\" from --table not part of given --ids. Please provide dataset names present in dataset collection" % (set_name) - - id_collect_dic["rbp_id"].append(rbp_id) - id_collect_dic["method_id"].append(method_id) - id_collect_dic["data_id"].append(data_id) - id_collect_dic["set_name"].append(set_name) - id_collect_dic["path"].append(id2path_dic[set_name]) - - f.closed - - assert id_collect_dic["rbp_id"], "nothing read in from --table. Please provide non-empty table in correct format (columns: rbp_id method_id data_id dataset_name)" - - """ - Construct RBPBench batch call. - - """ - - batch_call = "rbpbench batch" - batch_call += " --out %s" % (args.out_folder) - batch_call += " --genome %s" % (args.in_genome) - batch_call += " --ext %s" % (args.ext_up_down) - batch_call += " --motif-db %i" % (args.motif_db) - if args.fimo_nt_freqs: - batch_call += " --fimo-nt-freqs %s" % (args.fimo_nt_freqs) - batch_call += " --fimo-pval %s" % (str(args.fimo_pval)) - batch_call += " --bed-score-col %i" % (args.bed_score_col) - if args.unstranded: - batch_call += " --unstranded" - if args.unstranded_ct: - batch_call += " --unstranded-ct" - - rbp_ids = (" ").join(id_collect_dic["rbp_id"]) - method_ids = (" ").join(id_collect_dic["method_id"]) - data_ids = (" ").join(id_collect_dic["data_id"]) - paths = (" ").join(id_collect_dic["path"]) - - batch_call += " --rbp-list %s" % (rbp_ids) - batch_call += " --method-list %s" % (method_ids) - batch_call += " --data-list %s" % (data_ids) - batch_call += " --bed %s" % (paths) - - """ - Execute RBPBench batch call. - """ - - print("") - print("EXECUTING CALL:\n%s" % (batch_call)) - output = subprocess.getoutput(batch_call) - print("") - print("RUN OUTPUT:\n%s" % (output)) - print("") - print("DONE.")
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/rbp_ids.catrapid.omics.v2.1.human.6plus.loc.sample Sun Dec 03 21:31:37 2023 +0000 @@ -0,0 +1,259 @@ +A1CF +ACIN1 +ACO1 +ADAR +AGGF1 +AGO1 +AGO2 +AKAP1 +AKAP8 +ANKHD1 +AUH +BCCIP +BOLL +BUD13 +CAPRIN1 +CBX7 +CDC40 +CELF1 +CELF2 +CELF4 +CELF5 +CELF6 +CNBP +CNOT4 +CPEB1 +CPEB2 +CPEB4 +CPSF6 +CPSF7 +CSTF2 +CSTF2T +DAZ3 +DAZAP1 +DDX3X +DDX6 +DDX19B +DDX24 +DDX54 +DDX55 +DDX58 +DDX59 +DGCR8 +DKC1 +EFTUD2 +EIF3D +EIF4A3 +EIF4B +EIF4G2 +ELAVL1 +ELAVL2 +ELAVL3 +ELAVL4 +ENOX1 +ERI1 +ESRP1 +ESRP2 +EWSR1 +FAM120A +FASTKD2 +FBL +FIP1L1 +FMR1 +FTO +FUBP1 +FUBP3 +FUS +FXR1 +FXR2 +G3BP1 +G3BP2 +GNL3 +GPKOW +GRSF1 +GRWD1 +GTF2F1 +HLTF +HNRNPA0 +HNRNPA1 +HNRNPA1L2 +HNRNPA2B1 +HNRNPA3 +HNRNPAB +HNRNPC +HNRNPCL1 +HNRNPD +HNRNPDL +HNRNPF +HNRNPH1 +HNRNPH2 +HNRNPK +HNRNPL +HNRNPLL +HNRNPM +HNRNPU +HNRNPUL1 +IFIH1 +IGF2BP1 +IGF2BP2 +IGF2BP3 +IGHMBP2 +ILF2 +ILF3 +KHDRBS1 +KHDRBS2 +KHDRBS3 +KHSRP +LARP4 +LARP4B +LIN28A +LIN28B +LSM11 +MATR3 +MBNL1 +METAP2 +MSI1 +MSI2 +MTPAP +NCBP2 +NELFE +NKRF +NOL12 +NONO +NOP56 +NOP58 +NOVA1 +NOVA2 +NPM1 +NSUN2 +NUMA1 +NUP42 +NXF1 +OAS1 +OBI1 +PABPC1 +PABPC3 +PABPC4 +PABPC5 +PABPN1 +PABPN1L +PARP1 +PCBP1 +PCBP2 +PCBP4 +PPIE +PPIG +PPIL4 +PPRC1 +PRPF8 +PRR3 +PTBP1 +PTBP2 +PTBP3 +PUF60 +PUM1 +PUM2 +QKI +RALY +RALYL +RANGAP1 +RBFOX1 +RBFOX2 +RBFOX3 +RBM3 +RBM4 +RBM4B +RBM5 +RBM6 +RBM8A +RBM10 +RBM14 +RBM15 +RBM15B +RBM22 +RBM23 +RBM24 +RBM25 +RBM28 +RBM39 +RBM41 +RBM42 +RBM45 +RBM46 +RBM47 +RBMS1 +RBMS2 +RBMS3 +RBMX +RBMY1A1 +RC3H1 +TROVE2 +RPS5 +SAFB2 +SAMD4A +SART3 +SF1 +SF3A3 +SF3B4 +SFPQ +SLTM +SMNDC1 +SND1 +SNRNP70 +SNRPA +SNRPB2 +SOX2 +SRP14 +SRP68 +SRRM4 +SRSF1 +SRSF2 +SRSF3 +SRSF4 +SRSF5 +SRSF6 +SRSF7 +SRSF8 +SRSF9 +SRSF10 +SRSF11 +SSB +SUB1 +SUGP2 +SUPV3L1 +SYNCRIP +TAF15 +TARBP2 +TARDBP +TBRG4 +TIA1 +TIAL1 +TNRC6A +TRA2A +TRA2B +TRNAU1AP +TUT1 +U2AF1 +U2AF2 +UCHL5 +UNK +UPF1 +WDR33 +XPO5 +XRCC6 +XRN2 +YBX1 +YBX2 +YBX3 +YTHDC1 +YWHAG +ZC3H10 +ZCRB1 +ZFP36 +ZFP36L2 +ZNF184 +ZNF326 +ZNF622 +ZNF638 +ZRANB2 +SLBP \ No newline at end of file
--- a/tool-data/rbp_ids.catrapid.omics.v2.1.human.6plus.txt Sun Dec 03 12:51:54 2023 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,259 +0,0 @@ -A1CF -ACIN1 -ACO1 -ADAR -AGGF1 -AGO1 -AGO2 -AKAP1 -AKAP8 -ANKHD1 -AUH -BCCIP -BOLL -BUD13 -CAPRIN1 -CBX7 -CDC40 -CELF1 -CELF2 -CELF4 -CELF5 -CELF6 -CNBP -CNOT4 -CPEB1 -CPEB2 -CPEB4 -CPSF6 -CPSF7 -CSTF2 -CSTF2T -DAZ3 -DAZAP1 -DDX3X -DDX6 -DDX19B -DDX24 -DDX54 -DDX55 -DDX58 -DDX59 -DGCR8 -DKC1 -EFTUD2 -EIF3D -EIF4A3 -EIF4B -EIF4G2 -ELAVL1 -ELAVL2 -ELAVL3 -ELAVL4 -ENOX1 -ERI1 -ESRP1 -ESRP2 -EWSR1 -FAM120A -FASTKD2 -FBL -FIP1L1 -FMR1 -FTO -FUBP1 -FUBP3 -FUS -FXR1 -FXR2 -G3BP1 -G3BP2 -GNL3 -GPKOW -GRSF1 -GRWD1 -GTF2F1 -HLTF -HNRNPA0 -HNRNPA1 -HNRNPA1L2 -HNRNPA2B1 -HNRNPA3 -HNRNPAB -HNRNPC -HNRNPCL1 -HNRNPD -HNRNPDL -HNRNPF -HNRNPH1 -HNRNPH2 -HNRNPK -HNRNPL -HNRNPLL -HNRNPM -HNRNPU -HNRNPUL1 -IFIH1 -IGF2BP1 -IGF2BP2 -IGF2BP3 -IGHMBP2 -ILF2 -ILF3 -KHDRBS1 -KHDRBS2 -KHDRBS3 -KHSRP -LARP4 -LARP4B -LIN28A -LIN28B -LSM11 -MATR3 -MBNL1 -METAP2 -MSI1 -MSI2 -MTPAP -NCBP2 -NELFE -NKRF -NOL12 -NONO -NOP56 -NOP58 -NOVA1 -NOVA2 -NPM1 -NSUN2 -NUMA1 -NUP42 -NXF1 -OAS1 -OBI1 -PABPC1 -PABPC3 -PABPC4 -PABPC5 -PABPN1 -PABPN1L -PARP1 -PCBP1 -PCBP2 -PCBP4 -PPIE -PPIG -PPIL4 -PPRC1 -PRPF8 -PRR3 -PTBP1 -PTBP2 -PTBP3 -PUF60 -PUM1 -PUM2 -QKI -RALY -RALYL -RANGAP1 -RBFOX1 -RBFOX2 -RBFOX3 -RBM3 -RBM4 -RBM4B -RBM5 -RBM6 -RBM8A -RBM10 -RBM14 -RBM15 -RBM15B -RBM22 -RBM23 -RBM24 -RBM25 -RBM28 -RBM39 -RBM41 -RBM42 -RBM45 -RBM46 -RBM47 -RBMS1 -RBMS2 -RBMS3 -RBMX -RBMY1A1 -RC3H1 -TROVE2 -RPS5 -SAFB2 -SAMD4A -SART3 -SF1 -SF3A3 -SF3B4 -SFPQ -SLTM -SMNDC1 -SND1 -SNRNP70 -SNRPA -SNRPB2 -SOX2 -SRP14 -SRP68 -SRRM4 -SRSF1 -SRSF2 -SRSF3 -SRSF4 -SRSF5 -SRSF6 -SRSF7 -SRSF8 -SRSF9 -SRSF10 -SRSF11 -SSB -SUB1 -SUGP2 -SUPV3L1 -SYNCRIP -TAF15 -TARBP2 -TARDBP -TBRG4 -TIA1 -TIAL1 -TNRC6A -TRA2A -TRA2B -TRNAU1AP -TUT1 -U2AF1 -U2AF2 -UCHL5 -UNK -UPF1 -WDR33 -XPO5 -XRCC6 -XRN2 -YBX1 -YBX2 -YBX3 -YTHDC1 -YWHAG -ZC3H10 -ZCRB1 -ZFP36 -ZFP36L2 -ZNF184 -ZNF326 -ZNF622 -ZNF638 -ZRANB2 -SLBP
--- a/tool_data_table_conf.xml.sample Sun Dec 03 12:51:54 2023 +0000 +++ b/tool_data_table_conf.xml.sample Sun Dec 03 21:31:37 2023 +0000 @@ -7,6 +7,6 @@ <!-- IDs table file --> <table name="rbp_ids_table" comment_char="#"> <columns>value</columns> - <file path="${__HERE__}/tool-data/rbp_ids.catrapid.omics.v2.1.human.6plus.txt" /> + <file path="tool-data/rbp_ids.catrapid.omics.v2.1.human.6plus.loc" /> </table> </tables> \ No newline at end of file
--- a/tool_data_table_conf.xml.test Sun Dec 03 12:51:54 2023 +0000 +++ b/tool_data_table_conf.xml.test Sun Dec 03 21:31:37 2023 +0000 @@ -7,6 +7,6 @@ <!-- IDs table file --> <table name="rbp_ids_table" comment_char="#"> <columns>value</columns> - <file path="${__HERE__}/tool-data/rbp_ids.catrapid.omics.v2.1.human.6plus.txt" /> + <file path="${__HERE__}/tool-data/rbp_ids.catrapid.omics.v2.1.human.6plus.loc.sample" /> </table> </tables> \ No newline at end of file