comparison blast_annotations_processor.py @ 3:ca2f07b71581 draft default tip
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_annotations_tool commit 600e5a50a13a3a16a1970d6d4d31cb4f7bd549bf-dirty
| author | onnodg |
|---|---|
| date | Thu, 12 Feb 2026 13:52:07 +0000 |
| parents | 9ca209477dfd |
| children | |
| 2:9ca209477dfd | 3:ca2f07b71581 |
|---|---|
| 49 parser.add_argument('--header-anno', help='Output path for header annotations (tabular/xlsx)') | 49 parser.add_argument('--header-anno', help='Output path for header annotations (tabular/xlsx)') |
| 50 parser.add_argument('--log', help='Output path for log file (txt)', required=True) | 50 parser.add_argument('--log', help='Output path for log file (txt)', required=True) |
| 51 parser.add_argument('--filtered-fasta', required=True, | 51 parser.add_argument('--filtered-fasta', required=True, |
| 52 help='Filtered fasta file (fasta format) for downstream analysis') | 52 help='Filtered fasta file (fasta format) for downstream analysis') |
| 53 | 53 |
| 54 parser.add_argument('--uncertain-threshold', type=float, default=0.9, required=True, | 54 parser.add_argument('--uncertain-threshold', type=float, default=1, required=True, |
| 55 help='Threshold for resolving taxonomic conflicts (default: 0.9)') | 55 help='Threshold for resolving taxonomic conflicts') |
| 56 parser.add_argument('--eval-threshold', default='1e-10', type=float, required=True, | 56 parser.add_argument('--eval-threshold', type=float, required=True, |
| 57 help='E-value threshold for filtering results (default: 1e-10)') | 57 help='E-value threshold for filtering results') |
| 58 parser.add_argument('--use-counts', action='store_true', default=False, required=False, | 58 parser.add_argument('--use-counts', action='store_true', default=False, required=False, |
| 59 help='Use read counts in circular data') | 59 help='Use read counts in circular data') |
| 60 parser.add_argument('--ignore-rank', default='unknown', required=False, | 60 parser.add_argument('--ignore-rank', default='', required=False, |
| 61 help='Ignore rank when containing this text') | 61 help='Ignore rank when containing this text') |
| 62 parser.add_argument('--ignore-taxonomy', default='environmental', required=False, | 62 parser.add_argument('--ignore-taxonomy', default='', required=False, |
| 63 help="Don't use taxonomy containing this taxonomy") | 63 help="Don't use taxonomy containing this taxonomy") |
| 64 parser.add_argument('--bitscore-perc-cutoff', type=float, default=8, required=True, | 64 parser.add_argument('--bitscore-perc-cutoff', type=float, default=0, required=True, |
| 65 help='Bitscore percentage cutoff for considered hits') | 65 help='Bitscore percentage cutoff for considered hits') |
| 66 parser.add_argument('--min-bitscore', type=int, default=0, required=True, | 66 parser.add_argument('--min-bitscore', type=int, default=0, required=True, |
| 67 help='Minimum bitscore threshold for hits') | 67 help='Minimum bitscore threshold for hits') |
| 68 parser.add_argument('-iot', '--ignore-obiclean-type', type=str, default='singleton', required=False, | 68 parser.add_argument('-iot', '--ignore-obiclean-type', type=str, default='singleton', required=False, |
| 69 help='Ignore sequences with this obiclean type') | 69 help='Ignore sequences with this obiclean type') |
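
The right-hand column drops the hard-coded defaults for the threshold options. A minimal sketch of the revised parser behaviour, assuming a standard `argparse.ArgumentParser`; the sample command-line values below are illustrative, not taken from the tool:

```python
import argparse

# Sketch of the revised options (3:ca2f07b71581): --eval-threshold no longer
# carries a default, --uncertain-threshold defaults to 1, and the ignore
# filters default to empty strings.
parser = argparse.ArgumentParser()
parser.add_argument('--uncertain-threshold', type=float, default=1, required=True,
                    help='Threshold for resolving taxonomic conflicts')
parser.add_argument('--eval-threshold', type=float, required=True,
                    help='E-value threshold for filtering results')
parser.add_argument('--ignore-rank', default='',
                    help='Ignore rank when containing this text')
parser.add_argument('--ignore-taxonomy', default='',
                    help="Don't use taxonomy containing this taxonomy")
parser.add_argument('--bitscore-perc-cutoff', type=float, default=0, required=True,
                    help='Bitscore percentage cutoff for considered hits')

# Because the thresholds are required, they must always be supplied explicitly
# (the defaults above are never used in practice):
args = parser.parse_args(['--uncertain-threshold', '0.9',
                          '--eval-threshold', '1e-10',
                          '--bitscore-perc-cutoff', '8'])
print(args.eval_threshold)  # 1e-10, parsed as float
```

Since `required=True` is combined with `default=...` on these options, the defaults are effectively dead values; the Galaxy wrapper has to pass every threshold on each invocation.

| 2:9ca209477dfd | 3:ca2f07b71581 |
|---|---|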
| 480 for line in f: | 480 for line in f: |
| 481 if line.startswith('>'): | 481 if line.startswith('>'): |
| 482 total_headers += 1 | 482 total_headers += 1 |
| 483 header_line = line.rstrip("\n") | 483 header_line = line.rstrip("\n") |
| 484 header = header_line.split()[0].strip('>') | 484 header = header_line.split()[0].strip('>') |
| 485 if "count=" in header_line: | 485 if "obiclean_count={'XXX': " in header_line: |
| | 486 count = int(header_line.split("obiclean_count={'XXX': ")[1].split("}")[0]) |
| | 487 elif "count=" in header_line: |
| 486 count = int(header_line.split("count=")[1].split(";")[0]) | 488 count = int(header_line.split("count=")[1].split(";")[0]) |
| 487 else: | 489 else: |
| 488 count = 0 | 490 count = 0 |
| 489 | 491 |
| 490 passes_header_filter = check_header_string(header_line, ignore_illuminapairend_type, | 492 passes_header_filter = check_header_string(header_line, ignore_illuminapairend_type, |
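
In the new revision the read count is taken from an `obiclean_count={'XXX': N}` annotation when present, falling back to the plain `count=N;` field, and to 0 otherwise. A self-contained sketch of that branch; the helper name and the example headers are illustrative only, and the `'XXX'` key is reproduced literally as it appears in the diff:

```python
def extract_read_count(header_line: str) -> int:
    """Illustrative helper mirroring the count extraction in 3:ca2f07b71581."""
    if "obiclean_count={'XXX': " in header_line:
        # obiclean-style annotation, e.g. obiclean_count={'XXX': 42}
        return int(header_line.split("obiclean_count={'XXX': ")[1].split("}")[0])
    elif "count=" in header_line:
        # plain annotation, e.g. count=42;
        return int(header_line.split("count=")[1].split(";")[0])
    return 0

# Invented example headers, for demonstration only:
print(extract_read_count(">seq1 obiclean_count={'XXX': 42}; obiclean_status={'XXX': 'h'};"))  # 42
print(extract_read_count(">seq2 count=17; merged_sample={'XXX': 17};"))                       # 17
print(extract_read_count(">seq3 no count annotation"))                                        # 0
```

| 2:9ca209477dfd | 3:ca2f07b71581 |
|---|---|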
| 711 unanno_set = set(unanno_headers_ordered) | 713 unanno_set = set(unanno_headers_ordered) |
| 712 | 714 |
| 713 if not os.path.exists(anno_file_path): | 715 if not os.path.exists(anno_file_path): |
| 714 log_message(log_messages, f"Error: Input file {anno_file_path} not found") | 716 log_message(log_messages, f"Error: Input file {anno_file_path} not found") |
| 715 with open(args.log, 'w') as f: | 717 with open(args.log, 'w') as f: |
| 716 print('gaat nog niet goed hoor') | |
| 717 f.write("\n".join(log_messages)) | 718 f.write("\n".join(log_messages)) |
| 718 return | 719 return |
| 719 | 720 |
| 720 log_message(log_messages, f"Reading BLAST annotations") | 721 log_message(log_messages, f"Reading BLAST annotations") |
| 721 blast_groups = parse_blast_output(anno_file_path, args.eval_threshold, args.ignore_taxonomy, args.ignore_rank, | 722 blast_groups = parse_blast_output(anno_file_path, args.eval_threshold, args.ignore_taxonomy, args.ignore_rank, |
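
For context, the guard in this hunk writes the accumulated log lines to `--log` and returns early when the annotation file is missing (the stray debug `print` on the left-hand side was removed in the new revision). A minimal sketch, assuming `log_message` simply appends to a list; the wrapper function name is illustrative:

```python
import os

def log_message(log_messages, message):
    # Assumed behaviour: collect messages for the final log file.
    log_messages.append(message)

def run(anno_file_path, log_path):
    log_messages = []
    if not os.path.exists(anno_file_path):
        log_message(log_messages, f"Error: Input file {anno_file_path} not found")
        with open(log_path, 'w') as f:
            f.write("\n".join(log_messages))
        return  # bail out before parsing
    log_message(log_messages, "Reading BLAST annotations")
    # ... parse_blast_output(anno_file_path, ...) follows here, as in the hunk.
```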
