annotate structure_pipeline.py @ 9:a446ea7e2bc1 draft default tip

"planemo upload for repository https://github.com/chembl/chembl_webresource_client commit 7ca965e469cce3951d22c854dc7b6cb2a3a4f9f6"
author bgruening
date Tue, 23 Mar 2021 13:48:35 +0000
parents a57de37f12c2
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
a57de37f12c2 "planemo upload for repository https://github.com/chembl/chembl_webresource_client commit 78f2261af4e00c830ea311337d0aed9b297aad8e"
bgruening
parents:
diff changeset
1 import argparse
a57de37f12c2 "planemo upload for repository https://github.com/chembl/chembl_webresource_client commit 78f2261af4e00c830ea311337d0aed9b297aad8e"
bgruening
parents:
diff changeset
2
a57de37f12c2 "planemo upload for repository https://github.com/chembl/chembl_webresource_client commit 78f2261af4e00c830ea311337d0aed9b297aad8e"
bgruening
parents:
diff changeset
3 from chembl_structure_pipeline import checker, standardizer
a57de37f12c2 "planemo upload for repository https://github.com/chembl/chembl_webresource_client commit 78f2261af4e00c830ea311337d0aed9b297aad8e"
bgruening
parents:
diff changeset
4
a57de37f12c2 "planemo upload for repository https://github.com/chembl/chembl_webresource_client commit 78f2261af4e00c830ea311337d0aed9b297aad8e"
bgruening
parents:
diff changeset
5
a57de37f12c2 "planemo upload for repository https://github.com/chembl/chembl_webresource_client commit 78f2261af4e00c830ea311337d0aed9b297aad8e"
bgruening
parents:
diff changeset
def load_mols(input_file):
    """
    Read an SDF or MOL file and return its records as a list of strings.

    The file is split on ``$$$$`` delimiter lines. The lines of each
    delimited record are kept verbatim (title line, header lines and any
    blank line closing the last data item included) and joined with newline
    characters; the delimiter line itself is not part of the returned text.
    Records that contain only whitespace are skipped, so an empty file gives
    an empty list. Text that is not followed by a delimiter (for example a
    bare MOL file) is returned as a final record with trailing whitespace
    removed.
    """
    with open(input_file) as f:
        lines = f.read().split('\n')

    mols = []
    record = []
    for line in lines:
        if line.rstrip() == '$$$$':
            # Leading lines must not be stripped: the first three lines of a
            # molblock are positional header lines and may be blank.
            if any(part.strip() for part in record):
                mols.append('\n'.join(record))
            record = []
        else:
            record.append(line)

    # Whatever is left has no closing delimiter (bare MOL file or an
    # unterminated last record).
    tail = '\n'.join(record).rstrip()
    if tail:
        mols.append(tail)
    return mols


def write_mols(mols, output_file):
    """
    Write a list of molblocks to an SDF file.

    Each record is written unchanged, followed by a newline and a ``$$$$``
    delimiter line of its own. The delimiter is newline-terminated so that
    the next record starts on a fresh line, and the last record is
    terminated as well. An empty list produces an empty file.
    """
    with open(output_file, 'w') as f:
        f.writelines(mol + '\n$$$$\n' for mol in mols)
a57de37f12c2 "planemo upload for repository https://github.com/chembl/chembl_webresource_client commit 78f2261af4e00c830ea311337d0aed9b297aad8e"
bgruening
parents:
diff changeset
21
a57de37f12c2 "planemo upload for repository https://github.com/chembl/chembl_webresource_client commit 78f2261af4e00c830ea311337d0aed9b297aad8e"
bgruening
parents:
diff changeset
22
a57de37f12c2 "planemo upload for repository https://github.com/chembl/chembl_webresource_client commit 78f2261af4e00c830ea311337d0aed9b297aad8e"
bgruening
parents:
diff changeset
def standardize_molblock(mol):
    """
    Pass a molblock string through ``standardizer.standardize_molblock``
    and return whatever that call returns.
    """
    standardized = standardizer.standardize_molblock(mol)
    return standardized
a57de37f12c2 "planemo upload for repository https://github.com/chembl/chembl_webresource_client commit 78f2261af4e00c830ea311337d0aed9b297aad8e"
bgruening
parents:
diff changeset
25
a57de37f12c2 "planemo upload for repository https://github.com/chembl/chembl_webresource_client commit 78f2261af4e00c830ea311337d0aed9b297aad8e"
bgruening
parents:
diff changeset
26
a57de37f12c2 "planemo upload for repository https://github.com/chembl/chembl_webresource_client commit 78f2261af4e00c830ea311337d0aed9b297aad8e"
bgruening
parents:
diff changeset
def get_parent_molblock(mol):
    """
    Call ``standardizer.get_parent_molblock`` on a molblock string and
    return the first element of its result.
    """
    result = standardizer.get_parent_molblock(mol)
    # Only the first item is passed on (presumably the parent molblock);
    # anything else in the result is discarded.
    return result[0]
a57de37f12c2 "planemo upload for repository https://github.com/chembl/chembl_webresource_client commit 78f2261af4e00c830ea311337d0aed9b297aad8e"
bgruening
parents:
diff changeset
29
a57de37f12c2 "planemo upload for repository https://github.com/chembl/chembl_webresource_client commit 78f2261af4e00c830ea311337d0aed9b297aad8e"
bgruening
parents:
diff changeset
30
a57de37f12c2 "planemo upload for repository https://github.com/chembl/chembl_webresource_client commit 78f2261af4e00c830ea311337d0aed9b297aad8e"
bgruening
parents:
diff changeset
def check_molblock(mol):
    """
    Run ``checker.check_molblock`` on a molblock and append the outcome to it
    as two SD data items.

    ``MaxPenaltyScore`` holds the largest first element found among the
    reported issues (``'0'`` when nothing is reported) and ``IssueMessages``
    holds the second elements of all issues joined with ``'; '``.

    Returns the molblock text followed by both data items. Each data item is
    closed by a blank line, as the SDfile format requires, so that readers
    can tell the two fields apart. The returned text ends with the newline
    of the last value; the blank line closing the last item is completed by
    the newline the writer puts in front of the ``$$$$`` delimiter.
    """
    issues = checker.check_molblock(mol)
    max_penalty_score = str(max(issue[0] for issue in issues)) if issues else '0'
    message = '; '.join(issue[1] for issue in issues)
    mol_with_issues = '\n'.join((
        mol,
        '> <MaxPenaltyScore>', max_penalty_score, '',
        '> <IssueMessages>', message, '',
    ))
    return mol_with_issues
a57de37f12c2 "planemo upload for repository https://github.com/chembl/chembl_webresource_client commit 78f2261af4e00c830ea311337d0aed9b297aad8e"
bgruening
parents:
diff changeset
37
a57de37f12c2 "planemo upload for repository https://github.com/chembl/chembl_webresource_client commit 78f2261af4e00c830ea311337d0aed9b297aad8e"
bgruening
parents:
diff changeset
38
a57de37f12c2 "planemo upload for repository https://github.com/chembl/chembl_webresource_client commit 78f2261af4e00c830ea311337d0aed9b297aad8e"
bgruening
parents:
diff changeset
def main(argv=None):
    """
    Command-line entry point.

    Loads the molecules from ``--input``, then applies the requested steps in
    a fixed order (``--standardize``, ``--get_parent``, ``--check``), each
    one working on the output of the previous one, and writes the result to
    ``--output``.

    ``argv`` is the list of command-line arguments to parse; when it is
    ``None`` (the default) argparse reads ``sys.argv[1:]``. Missing
    ``--input`` or ``--output`` makes argparse print a usage message and
    exit with status 2.
    """
    parser = argparse.ArgumentParser(
        description='Standardize, get the parent of, or check molecules '
                    'with the ChEMBL structure pipeline')
    # Both paths are mandatory: without them the run would only fail later
    # with an unhelpful TypeError from open(None).
    parser.add_argument('-i', '--input', required=True, help='SDF/MOL input')
    parser.add_argument('-o', '--output', required=True, help="Standardized output")
    parser.add_argument('--standardize', action='store_true', help="Standardize molblock")
    parser.add_argument('--get_parent', action='store_true', help="Get parent molblock.")
    parser.add_argument('--check', action='store_true', help="Check molblock")
    args = parser.parse_args(argv)

    mols = load_mols(args.input)
    if args.standardize:
        mols = [standardize_molblock(mol) for mol in mols]
    if args.get_parent:
        mols = [get_parent_molblock(mol) for mol in mols]
    if args.check:
        mols = [check_molblock(mol) for mol in mols]
    write_mols(mols, args.output)


if __name__ == "__main__":
    main()