view filter_compounds.py @ 1:6c5df3eeb768 draft default tip

planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/filter_compounds commit b757d35278216a1f43bd53ea687a951b005e47df
author recetox
date Wed, 20 Mar 2024 12:20:17 +0000
parents 72b687d21f65
children
line wrap: on
line source

import argparse
import re

from openbabel import openbabel, pybel
# Turn off Open Babel's global message log at import time, before any molecule
# is parsed (presumably to keep per-molecule conversion warnings out of the
# tool's output - confirm against the Galaxy wrapper's stdio handling).
openbabel.obErrorLog.StopLogging()


def parse_command_line():
    """Parse the script's command-line options from ``sys.argv``.

    Returns:
        argparse.Namespace with ``input`` and ``output`` (required strings)
        and the boolean switches ``met`` and ``anorg`` (False unless given).
    """
    cli = argparse.ArgumentParser()

    # Mandatory file arguments.
    file_options = (
        (('-i', '--input'), 'Input file name'),
        (('-o', '--output'), 'Output file name'),
    )
    for flags, description in file_options:
        cli.add_argument(*flags, required=True, help=description)

    # Optional on/off switches selecting which compound classes to remove.
    filter_switches = (
        (('-m', '--met'), 'Remove organometallic compounds'),
        (('-a', '--anorg'), 'Remove anorganic compounds'),
    )
    for flags, description in filter_switches:
        cli.add_argument(*flags, required=False, action='store_true', help=description)

    return cli.parse_args()


def _formula_from_smiles(smiles):
    """Return the chemical-formula part of the InChI generated for *smiles*.

    The formula is taken as the text between the first and second '/' of the
    InChI string produced by pybel. An empty string is returned when *smiles*
    is blank or when the generated InChI contains no '/' separator, so callers
    can treat "no formula" uniformly instead of hitting an IndexError.
    Errors raised by pybel while parsing the SMILES are not caught here.
    """
    if not smiles.strip():
        return ''
    layers = pybel.readstring('smi', smiles).write('inchi').split('/')
    return layers[1] if len(layers) > 1 else ''


def filter_compounds(args, pattern):
    """Copy lines of ``args.input`` to ``args.output``, keeping only compounds
    whose formula matches every regular expression in *pattern*.

    Two input layouts are handled line by line:

    * indexed table - the text before the first tab is numeric and the rest
      of the line is the SMILES. Rows that fail a filter are kept as
      ``<index>\\t`` with an empty SMILES so the row numbering is preserved.
    * plain list - the line starts with the SMILES itself. Lines that fail a
      filter are dropped.

    Args:
        args: object with ``input`` and ``output`` file-path attributes.
        pattern: iterable of regex strings; an empty iterable keeps every line.
    """
    print(pattern)
    # Compile once up front instead of rebuilding the regexes for every line.
    compiled = [re.compile(x) for x in pattern]

    with open(args.input, "r") as infile, open(args.output, "w") as outfile:
        for line in infile:
            values = line.split('\t', 1)

            # check if input is list of SMILES or indexed table of SMILES
            indexed = values[0].isnumeric()
            if indexed:
                # An index with no tab after it (e.g. a last line without a
                # trailing newline) has no SMILES column: treat it as empty.
                smiles = values[1] if len(values) > 1 else ''
            else:
                smiles = values[0]

            formula = _formula_from_smiles(smiles)

            # write original line only if every selected filter passes
            if all(rx.search(formula) for rx in compiled):
                outfile.write(line)
            elif indexed:
                outfile.write(f'{values[0]}\t{""}\n')


def __main__():
    """
        Filter organometallics and/or anorganic (inorganic) compounds.

        Reads the command-line options, builds the list of regular
        expressions a compound's formula must match to be kept, and runs
        the filter. When neither --met nor --anorg is given the list stays
        empty, so every input line is written to the output unchanged.
    """
    args = parse_command_line()

    # Must stay a list: filter_compounds iterates over it, and a bare string
    # would be iterated character by character (each character then being
    # used as a regex, which raises re.error on '[').
    sel_pattern = []

    # check if user selected something to filter out, if not output file == input file
    if not args.met and not args.anorg:
        print("No filtering selected - user did not specify what to filter out.")

    # select patterns for filtering
    if args.met:
        # The whole formula may consist only of the listed element symbols,
        # digits and '.'; any other element symbol makes the match fail.
        sel_pattern.append(r'^(?:C|N|O|P|F|S|I|B|Si|Se|Cl|Br|Li|Na|H|K|[0-9]|\.)+$')
    if args.anorg:
        # Require an uppercase 'C' followed by a character that is not one of
        # the lowercase letters a, b, d-z, i.e. a 'C' that is not the start of
        # a two-letter symbol such as Ca, Cl or Cu.
        sel_pattern.append(r'[C][^abd-z]')

    filter_compounds(args, sel_pattern)


# Run the filter only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    __main__()