comparison extract_tables.py @ 9:a62c4a11a67d draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ena_upload commit 6770d277b4136b4068293c4260022d4ae33b2379
author iuc
date Thu, 10 Nov 2022 15:18:00 +0000
parents 6f6537780379
children 480d9e9d156b
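comparison (side by side: each row shows the old line number and text, then the new line number and text)
legend: equal, deleted, inserted, replaced
left column: 8:d147d6455873 — right column: 9:a62c4a11a67d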
17 args = parser.parse_args() 17 args = parser.parse_args()
18 18
19 with open(args.studies_json_path, 'r') as studies_json_file: 19 with open(args.studies_json_path, 'r') as studies_json_file:
20 studies_dict = json.load(studies_json_file) 20 studies_dict = json.load(studies_json_file)
21 studies_table = open(pathlib.Path(args.out_path) / 'studies.tsv', 'w') 21 studies_table = open(pathlib.Path(args.out_path) / 'studies.tsv', 'w')
22 studies_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'study_type', 22 studies_table.write('\t'.join(['alias', 'status', 'title', 'study_type',
23 'study_abstract', 'pubmed_id', 'submission_date']) + '\n') 23 'study_abstract', 'pubmed_id']) + '\n')
24 samples_table = open(pathlib.Path(args.out_path) / 'samples.tsv', 'w') 24 samples_table = open(pathlib.Path(args.out_path) / 'samples.tsv', 'w')
25 experiments_table = open(pathlib.Path(args.out_path) / 'experiments.tsv', 'w') 25 experiments_table = open(pathlib.Path(args.out_path) / 'experiments.tsv', 'w')
26 experiments_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'study_alias', 26 experiments_table.write('\t'.join(['alias', 'status', 'title', 'study_alias',
27 'sample_alias', 'design_description', 'library_name', 27 'sample_alias', 'design_description', 'library_name',
28 'library_strategy', 'library_source', 'library_selection', 28 'library_strategy', 'library_source', 'library_selection',
29 'library_layout', 'insert_size', 29 'library_layout', 'insert_size',
30 'library_construction_protocol', 'platform', 'instrument_model', 30 'library_construction_protocol', 'platform', 'instrument_model',
31 'submission_date']) + '\n') 31 ]) + '\n')
32 runs_table = open(pathlib.Path(args.out_path) / 'runs.tsv', 'w') 32 runs_table = open(pathlib.Path(args.out_path) / 'runs.tsv', 'w')
33 runs_table.write('\t'.join(['alias', 'status', 'accession', 'experiment_alias', 'file_name', 33 runs_table.write('\t'.join(['alias', 'status', 'experiment_alias', 'file_name',
34 'file_format', 'file_checksum', 'submission_date']) + '\n') 34 'file_format']) + '\n')
35 35
36 action = args.action 36 action = args.action
37 37
38 dt_oobj = datetime.now(tz=None) 38 dt_oobj = datetime.now(tz=None)
39 timestamp = dt_oobj.strftime("%Y%m%d_%H:%M:%S") 39 timestamp = dt_oobj.strftime("%Y%m%d_%H:%M:%S")
40 for study_index, study in enumerate(studies_dict): 40 for study_index, study in enumerate(studies_dict):
41 study_alias = 'study_' + str(study_index) + '_' + timestamp 41 study_alias = 'study_' + str(study_index) + '_' + timestamp
42 studies_table.write('\t'.join([study_alias, action, 'ENA_accession', study['title'], 42 studies_table.write('\t'.join([study_alias, action, study['title'],
43 study['type'], study['abstract'], study['pubmed_id'], 43 study['type'], study['abstract'], study['pubmed_id'],
44 'ENA_submission_data'])) 44 ]))
45 if "geo_location" in study['samples'][0].keys(): # sample belongs to a viral sample 45 if "geo_location" in study['samples'][0].keys(): # sample belongs to a viral sample
46 samples_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'scientific_name', 46 samples_table.write('\t'.join(['alias', 'status', 'title', 'scientific_name',
47 'taxon_id', 'sample_description', 'collection date', 47 'taxon_id', 'sample_description', 'collection date',
48 'geographic location (country and/or sea)', 'host common name', 'host subject id', 48 'geographic location (country and/or sea)', 'host common name', 'host subject id',
49 'host health state', 'host sex', 'host scientific name', 49 'host health state', 'host sex', 'host scientific name',
50 'collector name', 'collecting institution', 'isolate', 50 'collector name', 'collecting institution', 'isolate',
51 'submission_date']) + '\n') 51 ]) + '\n')
52 else: 52 else:
53 samples_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'scientific_name', 53 samples_table.write('\t'.join(['alias', 'status', 'title', 'scientific_name',
54 'taxon_id', 'sample_description', 'submission_date']) + '\n') 54 'taxon_id', 'sample_description']) + '\n')
55 for sample_index, sample in enumerate(study['samples']): 55 for sample_index, sample in enumerate(study['samples']):
56 sample_alias = 'sample_' + str(sample_index) + '_' + timestamp 56 sample_alias = 'sample_' + str(sample_index) + '_' + timestamp
57 if "geo_location" in sample.keys(): # sample belongs to a viral sample 57 if "geo_location" in sample.keys(): # sample belongs to a viral sample
58 if sample['collector_name'] == '': 58 if sample['collector_name'] == '':
59 sample['collector_name'] = 'unknown' 59 sample['collector_name'] = 'unknown'
60 samples_table.write('\t'.join([sample_alias, action, 'ena_accession', sample['title'], 60 samples_table.write('\t'.join([sample_alias, action, sample['title'],
61 sample['tax_name'], sample['tax_id'], 61 sample['tax_name'], sample['tax_id'],
62 sample['description'], sample['collection_date'], 62 sample['description'], sample['collection_date'],
63 sample['geo_location'], sample['host_common_name'], 63 sample['geo_location'], sample['host_common_name'],
64 sample['host_subject_id'], sample['host_health_state'], 64 sample['host_subject_id'], sample['host_health_state'],
65 sample['host_sex'], sample['host_scientific_name'], 65 sample['host_sex'], sample['host_scientific_name'],
66 sample['collector_name'], 66 sample['collector_name'],
67 sample['collecting_institution'], sample['isolate'], 67 sample['collecting_institution'], sample['isolate'],
68 'ENA_submission_date']) + '\n') 68 ]) + '\n')
69 else: 69 else:
70 samples_table.write('\t'.join([sample_alias, action, 'ena_accession', sample['title'], 70 samples_table.write('\t'.join([sample_alias, action, sample['title'],
71 sample['tax_name'], sample['tax_id'], 71 sample['tax_name'], sample['tax_id'],
72 sample['description'], 'ENA_submission_date']) + '\n') 72 sample['description']]) + '\n')
73 for exp_index, exp in enumerate(sample['experiments']): 73 for exp_index, exp in enumerate(sample['experiments']):
74 exp_alias = 'experiment_' + str(exp_index) + '.' + str(sample_index) + '_' + timestamp 74 exp_alias = 'experiment_' + str(exp_index) + '.' + str(sample_index) + '_' + timestamp
75 lib_alias = 'library_' + str(exp_index) + '_' + str(sample_index) 75 lib_alias = 'library_' + str(exp_index) + '_' + str(sample_index)
76 experiments_table.write('\t'.join([exp_alias, action, 'accession_ena', exp['title'], 76 experiments_table.write('\t'.join([exp_alias, action, exp['title'],
77 study_alias, sample_alias, exp['experiment_design'], 77 study_alias, sample_alias, exp['experiment_design'],
78 lib_alias, exp['library_strategy'], 78 lib_alias, exp['library_strategy'],
79 exp['library_source'], exp['library_selection'], 79 exp['library_source'], exp['library_selection'],
80 exp['library_layout'].lower(), exp['insert_size'], 80 exp['library_layout'].lower(), exp['insert_size'],
81 exp['library_construction_protocol'], 81 exp['library_construction_protocol'],
82 exp['platform'], exp['instrument_model'], 82 exp['platform'], exp['instrument_model'],
83 'submission_date_ENA']) + '\n') 83 ]) + '\n')
84 run_index = 0 84 run_index = 0
85 # exp['runs'] is a list of lists 85 # exp['runs'] is a list of lists
86 for (base_run, run_files) in exp['runs']: 86 for (base_run, run_files) in exp['runs']:
87 run_index += 1 87 run_index += 1
88 if base_run != '': 88 if base_run != '':
90 else: 90 else:
91 # no alias provided, generated a unique one 91 # no alias provided, generated a unique one
92 run_alias = '_'.join(['run_' + str(run_index), str(exp_index), 92 run_alias = '_'.join(['run_' + str(run_index), str(exp_index),
93 str(sample_index)]) + '_' + timestamp 93 str(sample_index)]) + '_' + timestamp
94 for file_entry in run_files: 94 for file_entry in run_files:
95 runs_table.write('\t'.join([run_alias, action, 'ena_run_accession', exp_alias, 95 runs_table.write('\t'.join([run_alias, action, exp_alias,
96 file_entry, FILE_FORMAT, 'file_checksum', 96 file_entry, FILE_FORMAT]) + '\n')
97 'submission_date_ENA']) + '\n')
98 97
99 studies_table.close() 98 studies_table.close()
100 samples_table.close() 99 samples_table.close()
101 experiments_table.close() 100 experiments_table.close()
102 runs_table.close() 101 runs_table.close()