Mercurial > repos > rakesh4osdd > clsi_profile
diff clsi_profile_type2_linux.ipynb @ 11:7dcc0e93288b draft default tip
"planemo upload for repository https://github.com/rakesh4osdd/asist/tree/master commit f590c3b1d71a9b8f2030909fa488b4ac0c3caed8-dirty"
author | rakesh4osdd |
---|---|
date | Wed, 30 Jun 2021 07:13:29 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/clsi_profile_type2_linux.ipynb Wed Jun 30 07:13:29 2021 +0000 @@ -0,0 +1,534 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 7, + "id": "9aa0a6f7", + "metadata": {}, + "outputs": [], + "source": [ + "# ASIST module2 | map AST result to the CLSI breakporints with combination antibiotics\n", + "# By rakesh4osdd@gmail.com, 06-Jun-2021\n", + "import pandas as pd\n", + "import re\n", + "import sys" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "9af8387e", + "metadata": {}, + "outputs": [], + "source": [ + "#print(pd.__version__, re.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "73d0783c", + "metadata": {}, + "outputs": [], + "source": [ + "# compare two MIC value strings\n", + "def check_mic(mic1,mic2,mic_type):\n", + " #print(mic1,mic2,mic_type)\n", + " try:\n", + " if '/' in mic1:\n", + " m1a = mic1.split('/')[0]\n", + " m1b = mic1.split('/')[1]\n", + " if float(m1a)==0 or float(m1b)==0:\n", + " strain_type='Strain could not be classified'\n", + " return(strain_type) \n", + " elif '/' in mic2:\n", + " m1a = mic1\n", + " if float(m1a)==0:\n", + " strain_type='Strain could not be classified'\n", + " return(strain_type) \n", + " m1b = '1'\n", + " elif float(mic1)==0:\n", + " strain_type='Strain could not be classified'\n", + " return(strain_type)\n", + " else:\n", + " m1a = mic1\n", + " \n", + " if '-' in mic2:\n", + " m2a = mic2.split('-')[0]\n", + " m2b = mic2.split('-')[1] \n", + " \n", + " except ValueError:\n", + " strain_type='Strain could not be classified' \n", + " return(strain_type)\n", + " try:\n", + " if '-' in mic2 and mic_type == 'i': # for intermediate only\n", + " if '/' in mic2:\n", + " m2a = mic2.split('-')[0].split('/')[0]\n", + " m2b = mic2.split('-')[0].split('/')[1]\n", + " m2aa = mic2.split('-')[1].split('/')[0]\n", + " m2bb = mic2.split('-')[1].split('/')[1]\n", + " if (float(m2aa)>=float(m1a)>=float(m2a) and float(m2bb)>=float(m1b)>=float(m2b)):\n", + " #print('intermediate')\n", + " m_type='Intermediate'\n", + " else:\n", + " #print('not define')\n", + " m_type='Strain could not be classified'\n", + " else:\n", + " m2a = mic2.split('-')[0]\n", + " m2b = mic2.split('-')[1] \n", + " if (float(m2b)>=float(m1a)>=float(m2a)):\n", + " #print('intermediate')\n", + " m_type='Intermediate'\n", + " else:\n", + " #print('not define')\n", + " m_type='Strain could not be classified' \n", + " #print (m1a,m1b,m2a,m2b,m2aa,m2bb)\n", + " elif '/' in mic2:\n", + " m2a = mic2.split('/')[0]\n", + " m2b = mic2.split('/')[1]\n", + " #print(m1a,m1b,m2a,m2b,mic_type)\n", + " if (mic_type=='s' and (float(m1a)<=float(m2a) and float(m1b)<=float(m2b))):\n", + " m_type='Susceptible'\n", + " elif (mic_type=='r' and (float(m1a)>=float(m2a) and float(m1b)>=float(m2b))):\n", + " m_type='Resistant'\n", + " elif (mic_type=='i' and (float(m1a)==float(m2a) and float(m1b)==float(m2b))):\n", + " m_type='Intermediate'\n", + " else:\n", + " m_type='Strain could not be classified'\n", + " elif '-' in mic2:\n", + " m_type='Strain could not be classified'\n", + " else:\n", + " m2a=mic2\n", + " if (mic_type=='s' and (float(m1a)<=float(m2a))):\n", + " m_type='Susceptible'\n", + " elif (mic_type=='r' and (float(m1a)>=float(m2a))):\n", + " m_type='Resistant'\n", + " elif (mic_type=='i' and (float(m1a)==float(m2a))):\n", + " m_type='Intermediate'\n", + " else:\n", + " m_type='Strain could not be classified' \n", + " except IndexError:\n", + " strain_type='Strain could not be classified' \n", + " return(strain_type)\n", + " \n", + " return(m_type)\n", + "\n", + "#check_mic('65','32-64','i')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "4d2ab1b1", + "metadata": {}, + "outputs": [], + "source": [ + "# compare MIC value in pandas list\n", + "def sus_res_int(mic):\n", + " #print(mic)\n", + " o_mic = mic[0].replace(' ', '')\n", + " s_mic = mic[1].replace(' ', '')\n", + " r_mic = mic[2].replace(' ', '')\n", + " i_mic = mic[3].replace(' ', '')\n", + " try:\n", + " if check_mic(o_mic,s_mic,'s')=='Susceptible':\n", + " strain_type='Susceptible'\n", + " elif check_mic(o_mic,r_mic,'r')=='Resistant':\n", + " strain_type='Resistant'\n", + " elif check_mic(o_mic,i_mic,'i')=='Intermediate':\n", + " strain_type='Intermediate' \n", + " else:\n", + " strain_type='Strain could not be classified'\n", + " except ValueError:\n", + " strain_type='Strain could not be classified' \n", + " return(strain_type)\n", + "\n", + "#mic=['128','16/4','128/4','32/4-64/4']\n", + "#sus_res_int(mic)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "0e22ef0d", + "metadata": {}, + "outputs": [ + { + "ename": "IndexError", + "evalue": "list index out of range", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-11-26f141926f14>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0minput_user\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margv\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0minput_clsi\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margv\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0moutput_table\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margv\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mIndexError\u001b[0m: list index out of range" + ] + } + ], + "source": [ + "# for input argument\n", + "input_user = sys.argv[1]\n", + "input_clsi = sys.argv[2]\n", + "output_table = sys.argv[3]" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "21d5fe63", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"input_user='~/Jupyterlab_notebook/ASIST_module/strain_profiles_16k.csv.csv'\n", + "#input_user='test-data/input2.csv'\n", + "input_clsi='test-data/clsi.csv'\n", + "output_profile='test-data/input2_profile.csv'\n", + "#output_table='test-data/input2_table.csv'\n", + "output_table='/home/rakesh/Jupyterlab_notebook/ASIST_module/strain_profiles_16k_table.csv'\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "1e64b025", + "metadata": {}, + "outputs": [], + "source": [ + "# read user AST data with selected 3 columns\n", + "strain_mic=pd.read_csv(input_user, sep=',', usecols =['Strain name', 'Antibiotics', 'MIC'],na_filter=False)\n", + "#strain_mic" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "0d30ddc3", + "metadata": {}, + "outputs": [], + "source": [ + "clsi_bp=pd.read_csv(input_clsi,sep=',')\n", + "\n", + "#clsi_bp[clsi_bp[['Antibiotics', 'Susceptible']].duplicated()].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "a818676d", + "metadata": {}, + "outputs": [], + "source": [ + "#clsi_bp\n", + "#strain_mic" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "c2aae757", + "metadata": {}, + "outputs": [], + "source": [ + "# warn user for duplicate files\n", + "input_dups=strain_mic[strain_mic[['Strain name','Antibiotics']].duplicated()]\n", + "if (input_dups.shape[0] == 0):\n", + " #print( \"No duplicates\")\n", + " pass\n", + "else:\n", + " with open(output_table, \"w\") as file_object:\n", + " # Append 'hello' at the end of file\n", + " file_object.write('S.No.,Strain name,Antibiotics,MIC\\nInput File Error: Please remove duplicate/mutiple MIC values for same combination of Strain name and Antibiotics from input file\\n')\n", + " input_dups.to_csv(output_table,na_rep='NA', mode='a')\n", + " exit()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "c6b4c59b", + "metadata": {}, + "outputs": [], + "source": [ + "# convert MIC to numbers sMIC, rMIC\n", + "clsi_bp['s_mic'] =clsi_bp[['Susceptible']].applymap(lambda x: (re.sub(r'[^0-9.\\/-]', '', x)))\n", + "clsi_bp['r_mic'] =clsi_bp[['Resistant']].applymap(lambda x: (re.sub(r'[^0-9.\\/-]', '', x)))\n", + "clsi_bp['i_mic'] = clsi_bp[['Intermediate']].applymap(lambda x: (re.sub(r'[^0-9.\\/-]', '', x)))" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "d0171f94", + "metadata": {}, + "outputs": [], + "source": [ + "#clsi_bp['i_mic'] = clsi_bp[['Intermediate']].applymap(lambda x: (re.sub(r'[^0-9.\\/-]', '', x)))" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "fe45b2dd", + "metadata": {}, + "outputs": [], + "source": [ + "# Read only numbers in MIC values\n", + "#try:\n", + "strain_mic['o_mic']=strain_mic[['MIC']].applymap(lambda x: (re.sub(r'[^0-9.\\/]','', x)))\n", + "#except TypeError:\n", + "# print('Waring: Error in MIC value')" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "ddbbe4d9", + "metadata": {}, + "outputs": [], + "source": [ + "#strain_mic" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "640508f1", + "metadata": {}, + "outputs": [], + "source": [ + "# capitalize each Antibiotic Name for comparision with removing whitespace\n", + "strain_mic['Strain name']=strain_mic['Strain name'].str.capitalize().str.replace(\" \",\"\")\n", + "strain_mic['Antibiotics']=strain_mic['Antibiotics'].str.capitalize().str.replace(\" \",\"\")\n", + "\n", + "clsi_bp['Antibiotics']=clsi_bp['Antibiotics'].str.capitalize().str.replace(\" \",\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "b87426f4", + "metadata": {}, + "outputs": [], + "source": [ + "#find duplicate values in input files\n", + "dups=strain_mic[strain_mic[['Strain name', 'Antibiotics']].duplicated(keep=False)]\n", + "if dups.shape[0] != 0:\n", + " print ('Please provide a single MIC value in input file for given duplicates combination of \\'Strain name and Antibiotics\\' to use the tool:-\\n',dups)\n", + " #exit()\n", + "else:\n", + " #compare CLSI Antibiotics only\n", + " #result=pd.merge(strain_mic, clsi_bp, on='Antibiotics',how='inner', indicator=True)[['Strain name','Antibiotics', 'MIC', 'o_mic', 's_mic', 'r_mic','_merge']]\n", + " try:\n", + " result=pd.merge(strain_mic, clsi_bp, on='Antibiotics',how='inner')[['Strain name','Antibiotics', 'MIC', 'o_mic', 's_mic', 'r_mic','i_mic']]\n", + " except KeyError:\n", + " print('Waring: Error in input Values')" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "91bfc94d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Strain name</th>\n", + " <th>Antibiotics</th>\n", + " <th>MIC</th>\n", + " <th>o_mic</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [Strain name, Antibiotics, MIC, o_mic]\n", + "Index: []" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dups.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "id": "b171f205", + "metadata": {}, + "outputs": [], + "source": [ + "#compare MIC values and assign Susceptible and Resistant to Strain\n", + "#try:\n", + "result[['CLSI_profile']] = result[['o_mic','s_mic','r_mic','i_mic']].apply(sus_res_int,axis = 1)\n", + "#except ValueError:\n", + "# print('Waring: Error in input MIC value')" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "id": "3336fd92", + "metadata": {}, + "outputs": [], + "source": [ + "#result" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "id": "f0dacfd1", + "metadata": {}, + "outputs": [], + "source": [ + "#result[['Strain name', 'Antibiotics', 'MIC','s_mic','r_mic','CLSI_profile']].to_csv(output_profile,sep=',', index=False, encoding='utf-8-sig')" + ] + }, + { + "cell_type": "code", + "execution_count": 135, + "id": "3d8d03f7", + "metadata": {}, + "outputs": [], + "source": [ + "#create a pivot table for ASIST\n", + "table=result[['Strain name', 'Antibiotics','CLSI_profile']].drop_duplicates()\n", + "result_table=pd.pivot_table(table, values ='CLSI_profile', index =['Strain name'],columns =['Antibiotics'], aggfunc = lambda x: ' '.join(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "id": "7d7223a3", + "metadata": {}, + "outputs": [], + "source": [ + "#result_table" + ] + }, + { + "cell_type": "code", + "execution_count": 137, + "id": "8a41b2ef", + "metadata": {}, + "outputs": [], + "source": [ + "#result_table.to_csv(output_table,na_rep='NA')" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "id": "8c9e5f87", + "metadata": {}, + "outputs": [], + "source": [ + "# reorder the Antibiotics for ASIST\n", + "clsi_ab=['Amikacin','Tobramycin','Gentamycin','Netilmicin','Imipenem','Meropenem','Doripenem','Ciprofloxacin','Levofloxacin',\n", + " 'Piperacillin/tazobactam','Ticarcillin/clavulanicacid','Cefotaxime','Ceftriaxone','Ceftazidime','Cefepime',\n", + " 'Trimethoprim/sulfamethoxazole','Ampicillin/sulbactam','Colistin','Polymyxinb','Tetracycline','Doxicycline ',\n", + " 'Minocycline']\n", + "result_selected=result_table.filter(clsi_ab)" + ] + }, + { + "cell_type": "code", + "execution_count": 139, + "id": "cdf43afb", + "metadata": {}, + "outputs": [], + "source": [ + "#print(result_selected.shape, result_table.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "id": "c4c4df30", + "metadata": {}, + "outputs": [], + "source": [ + "#result_selected.insert(0,'Resistance_phenotype','')" + ] + }, + { + "cell_type": "code", + "execution_count": 141, + "id": "9adb2703", + "metadata": {}, + "outputs": [], + "source": [ + "#rename headers\n", + "result_selected=result_selected.rename(columns = {'Ticarcillin/clavulanicacid':'Ticarcillin/clavulanic acid','Piperacillin/tazobactam':'Piperacillin/ tazobactam','Trimethoprim/sulfamethoxazole': 'Trimethoprim/ sulfamethoxazole','Ampicillin/sulbactam':'Ampicillin/ sulbactam', 'Polymyxinb': 'Polymyxin B'} )" + ] + }, + { + "cell_type": "code", + "execution_count": 142, + "id": "50e6cf5f", + "metadata": {}, + "outputs": [], + "source": [ + "#result_selected" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "id": "2833671c", + "metadata": {}, + "outputs": [], + "source": [ + "result_selected.to_csv(output_table,na_rep='NA')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}