Next changeset 1:1e9a43cbf524 (2022-06-15) |
Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/prot-scriber commit 8b58f3f03d6430689d228029bb2eb46c16cfff23 |
added:
prot-scriber.xml test-data/8_Proteins_prot-scriber.out test-data/8_Proteins_vs_Swissprot_blastp.txt test-data/8_Proteins_vs_Trembl_blastp.txt test-data/blacklist_stitle_regexs.txt |
b |
diff -r 000000000000 -r c840a1c77a0a prot-scriber.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/prot-scriber.xml Tue May 10 13:18:05 2022 +0000 |
[ |
b'@@ -0,0 +1,310 @@\n+<tool id="prot_scriber" name="prot-scriber" version="@TOOL_VERSION@" profile="21.05">\n+ <description>Protein annotation of short human readable descriptions</description>\n+ <macros>\n+ <token name="@TOOL_VERSION@">0.1.1</token>\n+ </macros>\n+ <requirements>\n+ <requirement type="package" version="@TOOL_VERSION@">prot-scriber</requirement>\n+ </requirements>\n+ <stdio>\n+ <regex match="panicked" level="fatal" source="stderr" />\n+ </stdio>\n+ <command>\n+ <![CDATA[\'prot-scriber\'\n+ #if str($input_config.input_config_selector) == "basic"\n+ #for $sst in $input_config.seq_sim_table\n+ -s \'$sst\'\n+ #end for\n+ #else if str($input_config.input_config_selector) == "advanced"\n+ #for $ssr in $input_config.advanced_input_repeat\n+ -s \'$ssr.seq_sim_table\'\n+ #if $ssr.header\n+ -e \'$ssr.header\'\n+ #end if\n+ #if $ssr.field_separator\n+ -p \'$ssr.field_separator\'\n+ #end if\n+ #if $ssr.blacklist_regexs\n+ -b \'$ssr.blacklist_regexs\'\n+ #end if\n+ #if $ssr.capture_replace_pairs\n+ -c \'$ssr.capture_replace_pairs\'\n+ #end if\n+ #if $ssr.filter_regexs\n+ -l \'$ssr.filter_regexs\'\n+ #end if\n+ #end for \n+ #if $input_config.expert_options.non_informative_words_regexs\n+ -w \'$input_config.expert_options.non_informative_words_regexs\'\n+ #end if\n+ #if $input_config.expert_options.description_split_regex\n+ -r "$input_config.expert_options.description_split_regex"\n+ #end if\n+ #if $input_config.expert_options.center_inverse_word_information_content_at_quantile\n+ -q $input_config.expert_options.center_inverse_word_information_content_at_quantile\n+ #end if\n+ #end if\n+ #if $seq_family.seq_families\n+ -f \'$seq_families\'\n+ #end if\n+ #if $seq_family.annotate_non_family_queries\n+ -a\n+ #end if\n+ #if $seq_family.seq_family_gene_ids_separator\n+ -g "$seq_family_gene_ids_separator"\n+ #end if\n+ #if $seq_family.seq_family_id_genes_separator\n+ -i \'$seq_family_id_genes_separator\'\n+ #end if\n+ -o \'$output\'\n+ ]]>\n+ </command>\n+ 
<inputs>\n+ <conditional name="input_config">\n+ <param type="select" name="input_config_selector" label="Choose input configuration options">\n+ <option value="basic" selected="true">Basic</option>\n+ <option value="advanced">Advanced</option>\n+ </param>\n+ <when value="basic">\n+ <param type="data" multiple="true" name="seq_sim_table" argument="-s" format="tabular" label="Sequence similarity search results in tabular format (-s)" help="Files in which to find sequence similarity search results in tabular format (SSST). Use e.g. Blast or Diamond to produce them.\n+ Required columns are: \'qacc sacc stitle\' (Blast) or \'qseqid sseqid stitle\' (Diamond)." /> \n+ </when>\n+ <when value="advanced">\n+ <repeat name="advanced_input_repeat" title="Sequence similarity table" min="1" default="1">\n+ <param type="data" name="seq_sim_table" argument="-s" format="tabular" label="Sequence similarity search result in tabular format (-s)" help="File in which to find sequence similarity search results in tabular format (SSST). Use e.g. Blast or Diamond to produce them.\n+ Required columns are: \'qacc sacc stitle\' (Blast) or \'qseqid sseqid stitle\' (Diamond)." /> \n+ <param type="text" optional="true" name="field_separator" argument="-p" label="Field separator (-p)" help="Field-Separator of the (-s) sequence similarity table. The default value is the \'TAB\' character. Set to \'default\' to use the hard coded default">\n+ <sanitizer>\n+ <valid initial="default">\n+ <add preset="string.printable" />\n+ </valid>\n+ </sanitizer>\n+ </param>\n+ <param type="text" optional="true" name="header" argument="-e" label="Header of the sequence similarity tables (-e)" help="'..b"ere:\n+ https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/filter_stitle_regexs.txt.\n+\n+ -c, --capture-replace-pairs (Expert option)\n+ A file with pairs of lines. Within each pair the first line is a regular expressions\n+ defining one or more capture groups. 
The second line of a pair is the\n+ string used to replace the match in the regular expression with. This means the second\n+ line contains the capture groups. These pairs are used to further filter\n+ the sequence similarity search result descriptions ('stitle' in Blast terminology). In\n+ contrast to the --filter-regex (-l) matches are not deleted, but replaced with the\n+ second line of the pair. Filtering is used to process descriptions ('stitle' in Blast\n+ terminology) and prepare the descriptions for the prot-scriber annotation process.\n+ Set to 'default' to use the hard coded default. An example file can be downloaded here:\n+ https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/capture_replace_pairs.txt.\n+\n+----\n+\n+**Gene family annotation**\n+\n+prot-scriber can also apply the same methodology to produce HRDs for sets of biological sequences, i.e. gene families::\n+\n+ -f, --seq-families\n+ A file in which families of biological sequences are stored, one family per line. Each\n+ line must have format 'fam-name TAB gene1,gene2,gene3'. Make sure no gene appears in\n+ more than one family.\n+\n+ -g, --seq-family-gene-ids-separator \n+ A regular expression used to split the list of gene-identifiers in the\n+ argument --seq-families (-f) gene families file. Default is '(\\s*,\\s*|\\s+)'.\n+\n+ -a, --annotate-non-family-queries\n+ Use this option only in combination with --seq-families (-f), i.e. when prot-scriber is\n+ used to generate human readable descriptions for gene families. If in that context this\n+ flag is given, queries for which there are sequence similarity search (Blast) results\n+ but that are NOT members of a sequence family will receive an annotation (human readable\n+ description) in the output file, too. Default value of this setting is 'OFF' (false).\n+\n+----\n+\n+**Expert options**\n+\n+Some additional optional configuration. 
Only use when you know what you are doing::\n+\n+ -w, --non-informative-words-regexs\n+ A file in which regular expressions (regexs) are stored, one per line. These\n+ regexs are used to recognize non-informative words, which will only receive a minimum\n+ score in the prot-scriber process that generates human readable descriptions. There is a\n+ default list hard-coded into prot-scriber. An example file can be downloaded here:\n+ https://raw.githubusercontent.com/usadellab/prot-scriber/master/misc/non_informative_words_regexs.txt.\n+\n+ -r, --description-split-regex\n+ A regular expression to be used to split descriptions (`stitle` in Blast\n+ terminology) into words. Default is '([~_\\-/|\\;,':.\\s]+)'.\n+\n+ -q, --center-inverse-word-information-content-at-quantile\n+ The quantile (percentile) to be subtracted from calculated inverse word information\n+ content to center these values. Consequently, this must be a value between zero and one\n+ or literal 50, which is interpreted as mean instead of a quantile. Default is 50,\n+ implying centering at the mean.\n+\n+----\n+\n+**Output**\n+\n+prot-scriber outputs a single tab-separated text file with the annotated sequences or gene-families, depending on how you ran the program, one result per line::\n+\n+ Annotee-Identifier\tHuman-Readable-Description\n+ Soltu.DM.02G020600.1\tarath strubbelig receptor family\n+ Soltu.DM.S001650.1\tgermin member\n+ Soltu.DM.03G011280.1\tincreased dna methylation\n+ ...\n+\n+ ]]>\n+ </help>\n+</tool>\n" |
b |
diff -r 000000000000 -r c840a1c77a0a test-data/8_Proteins_prot-scriber.out --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/8_Proteins_prot-scriber.out Tue May 10 13:18:05 2022 +0000 |
b |
@@ -0,0 +1,9 @@ +Annotee-Identifier Human-Readable-Description +Soltu.DM.02G020600.1 arath strubbelig receptor family +Soltu.DM.S001650.1 germin member +Soltu.DM.03G011280.1 increased dna methylation +Soltu.DM.02G015700.1 lrr receptor serine threonine kinase +Soltu.DM.07G016620.1 gdsl esterase lipase +Soltu.DM.04G035790.1 phosphatidylinositol phosphatidylcholine transfer sfh +Soltu.DM.01G022510.1 5 amp activated kinase subunit gamma +Soltu.DM.01G045390.1 hva |
b |
diff -r 000000000000 -r c840a1c77a0a test-data/8_Proteins_vs_Swissprot_blastp.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/8_Proteins_vs_Swissprot_blastp.txt Tue May 10 13:18:05 2022 +0000 |
b |
b'@@ -0,0 +1,136 @@\n+Soltu.DM.02G015700.1\tsp|Q1MX30|XA21_ORYSI\tsp|Q1MX30|XA21_ORYSI Receptor kinase-like protein Xa21 OS=Oryza sativa subsp. indica OX=39946 GN=XA21 PE=1 SV=1\n+Soltu.DM.02G015700.1\tsp|Q2R2D5|XA21_ORYSJ\tsp|Q2R2D5|XA21_ORYSJ Receptor kinase-like protein Xa21 OS=Oryza sativa subsp. japonica OX=39947 GN=XA21 PE=1 SV=1\n+Soltu.DM.02G015700.1\tsp|C0LGP4|Y3475_ARATH\tsp|C0LGP4|Y3475_ARATH Probable LRR receptor-like serine/threonine-protein kinase At3g47570 OS=Arabidopsis thaliana OX=3702 GN=At3g47570 PE=2 SV=1\n+Soltu.DM.02G015700.1\tsp|C0LGT6|EFR_ARATH\tsp|C0LGT6|EFR_ARATH LRR receptor-like serine/threonine-protein kinase EFR OS=Arabidopsis thaliana OX=3702 GN=EFR PE=1 SV=1\n+Soltu.DM.02G015700.1\tsp|Q9SD62|Y3471_ARATH\tsp|Q9SD62|Y3471_ARATH Putative receptor-like protein kinase At3g47110 OS=Arabidopsis thaliana OX=3702 GN=At3g47110 PE=3 SV=1\n+Soltu.DM.02G015700.1\tsp|Q9ZUI0|Y2241_ARATH\tsp|Q9ZUI0|Y2241_ARATH Putative leucine-rich repeat receptor-like serine/threonine-protein kinase At2g24130 OS=Arabidopsis thaliana OX=3702 GN=At2g24130 PE=3 SV=1\n+Soltu.DM.02G015700.1\tsp|Q9FL28|FLS2_ARATH\tsp|Q9FL28|FLS2_ARATH LRR receptor-like serine/threonine-protein kinase FLS2 OS=Arabidopsis thaliana OX=3702 GN=FLS2 PE=1 SV=1\n+Soltu.DM.02G015700.1\tsp|Q9SJG2|Y2296_ARATH\tsp|Q9SJG2|Y2296_ARATH Probable receptor-like protein kinase At2g42960 OS=Arabidopsis thaliana OX=3702 GN=At2g42960 PE=3 SV=1\n+Soltu.DM.02G015700.1\tsp|Q9LRP3|Y3174_ARATH\tsp|Q9LRP3|Y3174_ARATH Probable receptor-like protein kinase At3g17420 OS=Arabidopsis thaliana OX=3702 GN=At3g17420 PE=1 SV=1\n+Soltu.DM.02G015700.1\tsp|Q9LYN8|EMS1_ARATH\tsp|Q9LYN8|EMS1_ARATH Leucine-rich repeat receptor protein kinase EMS1 OS=Arabidopsis thaliana OX=3702 GN=EMS1 PE=1 SV=1\n+Soltu.DM.02G015700.1\tsp|Q3EDL4|Y1154_ARATH\tsp|Q3EDL4|Y1154_ARATH Probable serine/threonine-protein kinase At1g01540 OS=Arabidopsis thaliana OX=3702 GN=At1g01540 PE=1 
SV=2\n+Soltu.DM.02G015700.1\tsp|C0LGF4|FEI1_ARATH\tsp|C0LGF4|FEI1_ARATH LRR receptor-like serine/threonine-protein kinase FEI 1 OS=Arabidopsis thaliana OX=3702 GN=FEI1 PE=1 SV=1\n+Soltu.DM.02G015700.1\tsp|C0LGQ5|GSO1_ARATH\tsp|C0LGQ5|GSO1_ARATH LRR receptor-like serine/threonine-protein kinase GSO1 OS=Arabidopsis thaliana OX=3702 GN=GSO1 PE=1 SV=1\n+Soltu.DM.02G015700.1\tsp|C0LGJ1|Y1743_ARATH\tsp|C0LGJ1|Y1743_ARATH Probable LRR receptor-like serine/threonine-protein kinase At1g74360 OS=Arabidopsis thaliana OX=3702 GN=At1g74360 PE=1 SV=1\n+Soltu.DM.02G015700.1\tsp|Q9FZB1|Y5188_ARATH\tsp|Q9FZB1|Y5188_ARATH Probable LRR receptor-like serine/threonine-protein kinase At1g51880 OS=Arabidopsis thaliana OX=3702 GN=At1g51880 PE=2 SV=1\n+Soltu.DM.02G015700.1\tsp|Q9SHI2|Y1723_ARATH\tsp|Q9SHI2|Y1723_ARATH Leucine-rich repeat receptor-like serine/threonine-protein kinase At1g17230 OS=Arabidopsis thaliana OX=3702 GN=At1g17230 PE=1 SV=2\n+Soltu.DM.02G015700.1\tsp|Q9C7S5|PSYR1_ARATH\tsp|Q9C7S5|PSYR1_ARATH Tyrosine-sulfated glycopeptide receptor 1 OS=Arabidopsis thaliana OX=3702 GN=PSY1R PE=1 SV=1\n+Soltu.DM.02G015700.1\tsp|O49564|CRK27_ARATH\tsp|O49564|CRK27_ARATH Cysteine-rich receptor-like protein kinase 27 OS=Arabidopsis thaliana OX=3702 GN=CRK27 PE=3 SV=2\n+Soltu.DM.02G015700.1\tsp|Q0JA29|FLS2_ORYSJ\tsp|Q0JA29|FLS2_ORYSJ LRR receptor-like serine/threonine-protein kinase FLS2 OS=Oryza sativa subsp. japonica OX=39947 GN=FLS2 PE=1 SV=1\n+Soltu.DM.02G015700.1\tsp|Q7G768|BRL2_ORYSJ\tsp|Q7G768|BRL2_ORYSJ Brassinosteroid LRR receptor kinase BRL2 OS=Oryza sativa subsp. 
japonica OX=39947 GN=BRL2 PE=2 SV=1\n+Soltu.DM.02G015700.1\tsp|Q9FGL5|CEPR1_ARATH\tsp|Q9FGL5|CEPR1_ARATH Receptor protein-tyrosine kinase CEPR1 OS=Arabidopsis thaliana OX=3702 GN=CEPR1 PE=1 SV=1\n+Soltu.DM.02G015700.1\tsp|Q8LPB4|PSKR1_DAUCA\tsp|Q8LPB4|PSKR1_DAUCA Phytosulfokine receptor 1 OS=Daucus carota OX=4039 GN=PSKR PE=1 SV=1\n+Soltu.DM.02G015700.1\tsp|C0LGL9|FEI2_ARATH\tsp|C0LGL9|FEI2_ARATH LRR receptor-like serine/threonine-protein kinase FEI 2 OS=Arabidopsis thaliana OX=3702 GN=FEI2 PE=1 SV=1\n+Soltu.DM.02G015700.1\tsp|O22476|BRI1_ARATH\tsp|O22476|BRI'..b'.01G045390.1\tsp|Q9S760|HA22D_ARATH\tsp|Q9S760|HA22D_ARATH HVA22-like protein d OS=Arabidopsis thaliana OX=3702 GN=HVA22D PE=2 SV=1\n+Soltu.DM.02G020600.1\tsp|Q6R2J8|SRF8_ARATH\tsp|Q6R2J8|SRF8_ARATH Protein STRUBBELIG-RECEPTOR FAMILY 8 OS=Arabidopsis thaliana OX=3702 GN=SRF8 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|Q9LUL4|SRF7_ARATH\tsp|Q9LUL4|SRF7_ARATH Protein STRUBBELIG-RECEPTOR FAMILY 7 OS=Arabidopsis thaliana OX=3702 GN=SRF7 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|Q9C8M9|SRF6_ARATH\tsp|Q9C8M9|SRF6_ARATH Protein STRUBBELIG-RECEPTOR FAMILY 6 OS=Arabidopsis thaliana OX=3702 GN=SRF6 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|Q6R2K1|SRF5_ARATH\tsp|Q6R2K1|SRF5_ARATH Protein STRUBBELIG-RECEPTOR FAMILY 5 OS=Arabidopsis thaliana OX=3702 GN=SRF5 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|Q06BH3|SRF1_ARATH\tsp|Q06BH3|SRF1_ARATH Protein STRUBBELIG-RECEPTOR FAMILY 1 OS=Arabidopsis thaliana OX=3702 GN=SRF1 PE=1 SV=2\n+Soltu.DM.02G020600.1\tsp|Q9FG24|SRF2_ARATH\tsp|Q9FG24|SRF2_ARATH Protein STRUBBELIG-RECEPTOR FAMILY 2 OS=Arabidopsis thaliana OX=3702 GN=SRF2 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|Q6R2K3|SRF3_ARATH\tsp|Q6R2K3|SRF3_ARATH Protein STRUBBELIG-RECEPTOR FAMILY 3 OS=Arabidopsis thaliana OX=3702 GN=SRF3 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|Q6R2K2|SRF4_ARATH\tsp|Q6R2K2|SRF4_ARATH Protein STRUBBELIG-RECEPTOR FAMILY 4 OS=Arabidopsis thaliana OX=3702 GN=SRF4 PE=2 SV=1\n+Soltu.DM.02G020600.1\tsp|Q8RWZ1|SUB_ARATH\tsp|Q8RWZ1|SUB_ARATH 
Protein STRUBBELIG OS=Arabidopsis thaliana OX=3702 GN=SUB PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|Q9LDZ5|PBL21_ARATH\tsp|Q9LDZ5|PBL21_ARATH Probable serine/threonine-protein kinase PBL21 OS=Arabidopsis thaliana OX=3702 GN=PBL21 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|O49339|PTI12_ARATH\tsp|O49339|PTI12_ARATH PTI1-like tyrosine-protein kinase 2 OS=Arabidopsis thaliana OX=3702 GN=PTI12 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|B9DFG5|PTI13_ARATH\tsp|B9DFG5|PTI13_ARATH PTI1-like tyrosine-protein kinase 3 OS=Arabidopsis thaliana OX=3702 GN=PTI13 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|Q41328|PTI1_SOLLC\tsp|Q41328|PTI1_SOLLC Pto-interacting protein 1 OS=Solanum lycopersicum OX=4081 GN=PTI1 PE=1 SV=2\n+Soltu.DM.02G020600.1\tsp|Q9LUT0|CARK1_ARATH\tsp|Q9LUT0|CARK1_ARATH Receptor-like cytoplasmic kinase 1 OS=Arabidopsis thaliana OX=3702 GN=CARK1 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|Q0WRY5|PBL7_ARATH\tsp|Q0WRY5|PBL7_ARATH Probable serine/threonine-protein kinase PBL7 OS=Arabidopsis thaliana OX=3702 GN=PBL7 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|O80719|Y2706_ARATH\tsp|O80719|Y2706_ARATH Probable receptor-like protein kinase At2g47060 OS=Arabidopsis thaliana OX=3702 GN=At2g47060 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|F4JEQ2|PBL23_ARATH\tsp|F4JEQ2|PBL23_ARATH Probable serine/threonine-protein kinase PBL23 OS=Arabidopsis thaliana OX=3702 GN=PBL23 PE=2 SV=1\n+Soltu.DM.02G020600.1\tsp|Q8H1G6|PTI11_ARATH\tsp|Q8H1G6|PTI11_ARATH PTI1-like tyrosine-protein kinase 1 OS=Arabidopsis thaliana OX=3702 GN=PTI11 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|Q9SFT7|PBL26_ARATH\tsp|Q9SFT7|PBL26_ARATH Probable serine/threonine-protein kinase PBL26 OS=Arabidopsis thaliana OX=3702 GN=PBL26 PE=2 SV=1\n+Soltu.DM.02G020600.1\tsp|Q6I5Q6|RK185_ORYSJ\tsp|Q6I5Q6|RK185_ORYSJ Receptor-like cytoplasmic kinase 185 OS=Oryza sativa subsp. 
japonica OX=39947 GN=RLCK185 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|Q9FE20|PBS1_ARATH\tsp|Q9FE20|PBS1_ARATH Serine/threonine-protein kinase PBS1 OS=Arabidopsis thaliana OX=3702 GN=PBS1 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|P93749|Y2197_ARATH\tsp|P93749|Y2197_ARATH Probable protein kinase At2g41970 OS=Arabidopsis thaliana OX=3702 GN=At2g41970 PE=2 SV=1\n+Soltu.DM.02G020600.1\tsp|Q1PE89|PBL24_ARATH\tsp|Q1PE89|PBL24_ARATH Probable serine/threonine-protein kinase PBL24 OS=Arabidopsis thaliana OX=3702 GN=PBL24 PE=2 SV=1\n+Soltu.DM.02G020600.1\tsp|Q9LRY1|PBL25_ARATH\tsp|Q9LRY1|PBL25_ARATH Probable serine/threonine-protein kinase PBL25 OS=Arabidopsis thaliana OX=3702 GN=PBL25 PE=2 SV=1\n+Soltu.DM.02G020600.1\tsp|Q9LQQ8|PBL5_ARATH\tsp|Q9LQQ8|PBL5_ARATH Probable serine/threonine-protein kinase PBL5 OS=Arabidopsis thaliana OX=3702 GN=PBL5 PE=2 SV=1\n' |
b |
diff -r 000000000000 -r c840a1c77a0a test-data/8_Proteins_vs_Trembl_blastp.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/8_Proteins_vs_Trembl_blastp.txt Tue May 10 13:18:05 2022 +0000 |
b |
b'@@ -0,0 +1,136 @@\n+Soltu.DM.02G015700.1\tsp|Q1MX30|XA21_ORYSI\tsp|Q1MX30|XA21_ORYSI Receptor kinase-like protein Xa21 OS=Oryza sativa subsp. indica OX=39946 GN=XA21 PE=1 SV=1\n+Soltu.DM.02G015700.1\tsp|Q2R2D5|XA21_ORYSJ\tsp|Q2R2D5|XA21_ORYSJ Receptor kinase-like protein Xa21 OS=Oryza sativa subsp. japonica OX=39947 GN=XA21 PE=1 SV=1\n+Soltu.DM.02G015700.1\tsp|C0LGP4|Y3475_ARATH\tsp|C0LGP4|Y3475_ARATH Probable LRR receptor-like serine/threonine-protein kinase At3g47570 OS=Arabidopsis thaliana OX=3702 GN=At3g47570 PE=2 SV=1\n+Soltu.DM.02G015700.1\tsp|C0LGT6|EFR_ARATH\tsp|C0LGT6|EFR_ARATH LRR receptor-like serine/threonine-protein kinase EFR OS=Arabidopsis thaliana OX=3702 GN=EFR PE=1 SV=1\n+Soltu.DM.02G015700.1\tsp|Q9SD62|Y3471_ARATH\tsp|Q9SD62|Y3471_ARATH Putative receptor-like protein kinase At3g47110 OS=Arabidopsis thaliana OX=3702 GN=At3g47110 PE=3 SV=1\n+Soltu.DM.02G015700.1\tsp|Q9ZUI0|Y2241_ARATH\tsp|Q9ZUI0|Y2241_ARATH Putative leucine-rich repeat receptor-like serine/threonine-protein kinase At2g24130 OS=Arabidopsis thaliana OX=3702 GN=At2g24130 PE=3 SV=1\n+Soltu.DM.02G015700.1\tsp|Q9FL28|FLS2_ARATH\tsp|Q9FL28|FLS2_ARATH LRR receptor-like serine/threonine-protein kinase FLS2 OS=Arabidopsis thaliana OX=3702 GN=FLS2 PE=1 SV=1\n+Soltu.DM.02G015700.1\tsp|Q9SJG2|Y2296_ARATH\tsp|Q9SJG2|Y2296_ARATH Probable receptor-like protein kinase At2g42960 OS=Arabidopsis thaliana OX=3702 GN=At2g42960 PE=3 SV=1\n+Soltu.DM.02G015700.1\tsp|Q9LRP3|Y3174_ARATH\tsp|Q9LRP3|Y3174_ARATH Probable receptor-like protein kinase At3g17420 OS=Arabidopsis thaliana OX=3702 GN=At3g17420 PE=1 SV=1\n+Soltu.DM.02G015700.1\tsp|Q9LYN8|EMS1_ARATH\tsp|Q9LYN8|EMS1_ARATH Leucine-rich repeat receptor protein kinase EMS1 OS=Arabidopsis thaliana OX=3702 GN=EMS1 PE=1 SV=1\n+Soltu.DM.02G015700.1\tsp|Q3EDL4|Y1154_ARATH\tsp|Q3EDL4|Y1154_ARATH Probable serine/threonine-protein kinase At1g01540 OS=Arabidopsis thaliana OX=3702 GN=At1g01540 PE=1 
SV=2\n+Soltu.DM.02G015700.1\tsp|C0LGF4|FEI1_ARATH\tsp|C0LGF4|FEI1_ARATH LRR receptor-like serine/threonine-protein kinase FEI 1 OS=Arabidopsis thaliana OX=3702 GN=FEI1 PE=1 SV=1\n+Soltu.DM.02G015700.1\tsp|C0LGQ5|GSO1_ARATH\tsp|C0LGQ5|GSO1_ARATH LRR receptor-like serine/threonine-protein kinase GSO1 OS=Arabidopsis thaliana OX=3702 GN=GSO1 PE=1 SV=1\n+Soltu.DM.02G015700.1\tsp|C0LGJ1|Y1743_ARATH\tsp|C0LGJ1|Y1743_ARATH Probable LRR receptor-like serine/threonine-protein kinase At1g74360 OS=Arabidopsis thaliana OX=3702 GN=At1g74360 PE=1 SV=1\n+Soltu.DM.02G015700.1\tsp|Q9FZB1|Y5188_ARATH\tsp|Q9FZB1|Y5188_ARATH Probable LRR receptor-like serine/threonine-protein kinase At1g51880 OS=Arabidopsis thaliana OX=3702 GN=At1g51880 PE=2 SV=1\n+Soltu.DM.02G015700.1\tsp|Q9SHI2|Y1723_ARATH\tsp|Q9SHI2|Y1723_ARATH Leucine-rich repeat receptor-like serine/threonine-protein kinase At1g17230 OS=Arabidopsis thaliana OX=3702 GN=At1g17230 PE=1 SV=2\n+Soltu.DM.02G015700.1\tsp|Q9C7S5|PSYR1_ARATH\tsp|Q9C7S5|PSYR1_ARATH Tyrosine-sulfated glycopeptide receptor 1 OS=Arabidopsis thaliana OX=3702 GN=PSY1R PE=1 SV=1\n+Soltu.DM.02G015700.1\tsp|O49564|CRK27_ARATH\tsp|O49564|CRK27_ARATH Cysteine-rich receptor-like protein kinase 27 OS=Arabidopsis thaliana OX=3702 GN=CRK27 PE=3 SV=2\n+Soltu.DM.02G015700.1\tsp|Q0JA29|FLS2_ORYSJ\tsp|Q0JA29|FLS2_ORYSJ LRR receptor-like serine/threonine-protein kinase FLS2 OS=Oryza sativa subsp. japonica OX=39947 GN=FLS2 PE=1 SV=1\n+Soltu.DM.02G015700.1\tsp|Q7G768|BRL2_ORYSJ\tsp|Q7G768|BRL2_ORYSJ Brassinosteroid LRR receptor kinase BRL2 OS=Oryza sativa subsp. 
japonica OX=39947 GN=BRL2 PE=2 SV=1\n+Soltu.DM.02G015700.1\tsp|Q9FGL5|CEPR1_ARATH\tsp|Q9FGL5|CEPR1_ARATH Receptor protein-tyrosine kinase CEPR1 OS=Arabidopsis thaliana OX=3702 GN=CEPR1 PE=1 SV=1\n+Soltu.DM.02G015700.1\tsp|Q8LPB4|PSKR1_DAUCA\tsp|Q8LPB4|PSKR1_DAUCA Phytosulfokine receptor 1 OS=Daucus carota OX=4039 GN=PSKR PE=1 SV=1\n+Soltu.DM.02G015700.1\tsp|C0LGL9|FEI2_ARATH\tsp|C0LGL9|FEI2_ARATH LRR receptor-like serine/threonine-protein kinase FEI 2 OS=Arabidopsis thaliana OX=3702 GN=FEI2 PE=1 SV=1\n+Soltu.DM.02G015700.1\tsp|O22476|BRI1_ARATH\tsp|O22476|BRI'..b'.01G045390.1\tsp|Q9S760|HA22D_ARATH\tsp|Q9S760|HA22D_ARATH HVA22-like protein d OS=Arabidopsis thaliana OX=3702 GN=HVA22D PE=2 SV=1\n+Soltu.DM.02G020600.1\tsp|Q6R2J8|SRF8_ARATH\tsp|Q6R2J8|SRF8_ARATH Protein STRUBBELIG-RECEPTOR FAMILY 8 OS=Arabidopsis thaliana OX=3702 GN=SRF8 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|Q9LUL4|SRF7_ARATH\tsp|Q9LUL4|SRF7_ARATH Protein STRUBBELIG-RECEPTOR FAMILY 7 OS=Arabidopsis thaliana OX=3702 GN=SRF7 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|Q9C8M9|SRF6_ARATH\tsp|Q9C8M9|SRF6_ARATH Protein STRUBBELIG-RECEPTOR FAMILY 6 OS=Arabidopsis thaliana OX=3702 GN=SRF6 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|Q6R2K1|SRF5_ARATH\tsp|Q6R2K1|SRF5_ARATH Protein STRUBBELIG-RECEPTOR FAMILY 5 OS=Arabidopsis thaliana OX=3702 GN=SRF5 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|Q06BH3|SRF1_ARATH\tsp|Q06BH3|SRF1_ARATH Protein STRUBBELIG-RECEPTOR FAMILY 1 OS=Arabidopsis thaliana OX=3702 GN=SRF1 PE=1 SV=2\n+Soltu.DM.02G020600.1\tsp|Q9FG24|SRF2_ARATH\tsp|Q9FG24|SRF2_ARATH Protein STRUBBELIG-RECEPTOR FAMILY 2 OS=Arabidopsis thaliana OX=3702 GN=SRF2 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|Q6R2K3|SRF3_ARATH\tsp|Q6R2K3|SRF3_ARATH Protein STRUBBELIG-RECEPTOR FAMILY 3 OS=Arabidopsis thaliana OX=3702 GN=SRF3 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|Q6R2K2|SRF4_ARATH\tsp|Q6R2K2|SRF4_ARATH Protein STRUBBELIG-RECEPTOR FAMILY 4 OS=Arabidopsis thaliana OX=3702 GN=SRF4 PE=2 SV=1\n+Soltu.DM.02G020600.1\tsp|Q8RWZ1|SUB_ARATH\tsp|Q8RWZ1|SUB_ARATH 
Protein STRUBBELIG OS=Arabidopsis thaliana OX=3702 GN=SUB PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|Q9LDZ5|PBL21_ARATH\tsp|Q9LDZ5|PBL21_ARATH Probable serine/threonine-protein kinase PBL21 OS=Arabidopsis thaliana OX=3702 GN=PBL21 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|O49339|PTI12_ARATH\tsp|O49339|PTI12_ARATH PTI1-like tyrosine-protein kinase 2 OS=Arabidopsis thaliana OX=3702 GN=PTI12 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|B9DFG5|PTI13_ARATH\tsp|B9DFG5|PTI13_ARATH PTI1-like tyrosine-protein kinase 3 OS=Arabidopsis thaliana OX=3702 GN=PTI13 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|Q41328|PTI1_SOLLC\tsp|Q41328|PTI1_SOLLC Pto-interacting protein 1 OS=Solanum lycopersicum OX=4081 GN=PTI1 PE=1 SV=2\n+Soltu.DM.02G020600.1\tsp|Q9LUT0|CARK1_ARATH\tsp|Q9LUT0|CARK1_ARATH Receptor-like cytoplasmic kinase 1 OS=Arabidopsis thaliana OX=3702 GN=CARK1 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|Q0WRY5|PBL7_ARATH\tsp|Q0WRY5|PBL7_ARATH Probable serine/threonine-protein kinase PBL7 OS=Arabidopsis thaliana OX=3702 GN=PBL7 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|O80719|Y2706_ARATH\tsp|O80719|Y2706_ARATH Probable receptor-like protein kinase At2g47060 OS=Arabidopsis thaliana OX=3702 GN=At2g47060 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|F4JEQ2|PBL23_ARATH\tsp|F4JEQ2|PBL23_ARATH Probable serine/threonine-protein kinase PBL23 OS=Arabidopsis thaliana OX=3702 GN=PBL23 PE=2 SV=1\n+Soltu.DM.02G020600.1\tsp|Q8H1G6|PTI11_ARATH\tsp|Q8H1G6|PTI11_ARATH PTI1-like tyrosine-protein kinase 1 OS=Arabidopsis thaliana OX=3702 GN=PTI11 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|Q9SFT7|PBL26_ARATH\tsp|Q9SFT7|PBL26_ARATH Probable serine/threonine-protein kinase PBL26 OS=Arabidopsis thaliana OX=3702 GN=PBL26 PE=2 SV=1\n+Soltu.DM.02G020600.1\tsp|Q6I5Q6|RK185_ORYSJ\tsp|Q6I5Q6|RK185_ORYSJ Receptor-like cytoplasmic kinase 185 OS=Oryza sativa subsp. 
japonica OX=39947 GN=RLCK185 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|Q9FE20|PBS1_ARATH\tsp|Q9FE20|PBS1_ARATH Serine/threonine-protein kinase PBS1 OS=Arabidopsis thaliana OX=3702 GN=PBS1 PE=1 SV=1\n+Soltu.DM.02G020600.1\tsp|P93749|Y2197_ARATH\tsp|P93749|Y2197_ARATH Probable protein kinase At2g41970 OS=Arabidopsis thaliana OX=3702 GN=At2g41970 PE=2 SV=1\n+Soltu.DM.02G020600.1\tsp|Q1PE89|PBL24_ARATH\tsp|Q1PE89|PBL24_ARATH Probable serine/threonine-protein kinase PBL24 OS=Arabidopsis thaliana OX=3702 GN=PBL24 PE=2 SV=1\n+Soltu.DM.02G020600.1\tsp|Q9LRY1|PBL25_ARATH\tsp|Q9LRY1|PBL25_ARATH Probable serine/threonine-protein kinase PBL25 OS=Arabidopsis thaliana OX=3702 GN=PBL25 PE=2 SV=1\n+Soltu.DM.02G020600.1\tsp|Q9LQQ8|PBL5_ARATH\tsp|Q9LQQ8|PBL5_ARATH Probable serine/threonine-protein kinase PBL5 OS=Arabidopsis thaliana OX=3702 GN=PBL5 PE=2 SV=1\n' |
b |
diff -r 000000000000 -r c840a1c77a0a test-data/blacklist_stitle_regexs.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/blacklist_stitle_regexs.txt Tue May 10 13:18:05 2022 +0000 |
b |
@@ -0,0 +1,11 @@ +(?i)\bsimilar\s+to +(?i)\bprobable\b +(?i)\bputative\b +(?i)\bpredicted\b +(?i)\buncharacterized\b +(?i)\bunknown\b +(?i)\bhypothetical\b +(?i)\bunnamed\b +(?i)\bfragment\b +(?i)\bwhole\s+genome\s+shotgun\s+sequence\b +(?i)\bclone\b \ No newline at end of file |