view clsi_profile_type2_linux.ipynb @ 11:7dcc0e93288b draft default tip

"planemo upload for repository https://github.com/rakesh4osdd/asist/tree/master commit f590c3b1d71a9b8f2030909fa488b4ac0c3caed8-dirty"
author rakesh4osdd
date Wed, 30 Jun 2021 07:13:29 +0000
parents
children
line wrap: on
line source

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "9aa0a6f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ASIST module2 | map AST result to the CLSI breakpoints with combination antibiotics\n",
    "# By rakesh4osdd@gmail.com, 06-Jun-2021\n",
    "import pandas as pd\n",
    "import re\n",
    "import sys"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "9af8387e",
   "metadata": {},
   "outputs": [],
   "source": [
    "#print(pd.__version__, re.__version__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "73d0783c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# compare two MIC value strings\n",
    "_UNCLASSIFIED = 'Strain could not be classified'\n",
    "\n",
    "\n",
    "def _as_pair(text):\n",
    "    \"\"\"Parse 'a/b' into (float(a), float(b)).\n",
    "\n",
    "    Raises IndexError when there is no '/', ValueError when a part is not\n",
    "    numeric. Anything after a second '/' is ignored.\n",
    "    \"\"\"\n",
    "    parts = text.split('/')\n",
    "    return float(parts[0]), float(parts[1])\n",
    "\n",
    "\n",
    "def _label(mic_type, observed, limits):\n",
    "    \"\"\"Compare observed values with single-valued breakpoints, component by component.\"\"\"\n",
    "    pairs = list(zip(observed, limits))\n",
    "    if mic_type == 's' and all(obs <= limit for obs, limit in pairs):\n",
    "        return 'Susceptible'\n",
    "    if mic_type == 'r' and all(obs >= limit for obs, limit in pairs):\n",
    "        return 'Resistant'\n",
    "    if mic_type == 'i' and all(obs == limit for obs, limit in pairs):\n",
    "        return 'Intermediate'\n",
    "    return _UNCLASSIFIED\n",
    "\n",
    "\n",
    "def check_mic(mic1,mic2,mic_type):\n",
    "    \"\"\"Classify an observed MIC against one breakpoint.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    mic1 : str\n",
    "        Observed MIC, either 'a' or a combination 'a/b'.\n",
    "    mic2 : str\n",
    "        Breakpoint: 'a', 'a/b', a range 'a-b' or a combination range 'a/b-c/d'.\n",
    "    mic_type : str\n",
    "        's' (observed <= breakpoint), 'r' (observed >= breakpoint) or\n",
    "        'i' (observed == breakpoint, or inside an inclusive range).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str\n",
    "        'Susceptible', 'Resistant', 'Intermediate' or\n",
    "        'Strain could not be classified'.\n",
    "\n",
    "    A zero, blank or non-numeric observed value and a blank or malformed\n",
    "    breakpoint all give 'Strain could not be classified'; no ValueError or\n",
    "    IndexError leaves this function. Non-string arguments still raise TypeError.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        if '/' in mic1:\n",
    "            obs_a, obs_b = _as_pair(mic1)\n",
    "        else:\n",
    "            obs_a = float(mic1)\n",
    "            # stand-in second component; it is only read when mic2 is a combination\n",
    "            obs_b = 1.0\n",
    "        if obs_a == 0 or obs_b == 0:\n",
    "            return _UNCLASSIFIED\n",
    "\n",
    "        if '-' in mic2 and mic_type == 'i':   # ranges are honoured for intermediate only\n",
    "            bounds = mic2.split('-')\n",
    "            low, high = bounds[0], bounds[1]\n",
    "            if '/' in mic2:\n",
    "                low_a, low_b = _as_pair(low)\n",
    "                high_a, high_b = _as_pair(high)\n",
    "                inside = high_a >= obs_a >= low_a and high_b >= obs_b >= low_b\n",
    "            else:\n",
    "                inside = float(high) >= obs_a >= float(low)\n",
    "            return 'Intermediate' if inside else _UNCLASSIFIED\n",
    "\n",
    "        if '/' in mic2:\n",
    "            return _label(mic_type, (obs_a, obs_b), _as_pair(mic2))\n",
    "        if '-' in mic2:\n",
    "            # a range breakpoint is meaningless for 's' and 'r'\n",
    "            return _UNCLASSIFIED\n",
    "        return _label(mic_type, (obs_a,), (float(mic2),))\n",
    "    except (ValueError, IndexError):\n",
    "        # blank or malformed numbers on either side\n",
    "        return _UNCLASSIFIED\n",
    "\n",
    "#check_mic('65','32-64','i')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "4d2ab1b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# compare MIC value in pandas list\n",
    "def sus_res_int(mic):\n",
    "    \"\"\"Assign a category to one row of MIC strings.\n",
    "\n",
    "    mic holds, in this order, the observed MIC and the susceptible, resistant\n",
    "    and intermediate breakpoints. It may be a list, a tuple or a pandas Series.\n",
    "    The checks run in the order susceptible, resistant, intermediate and the\n",
    "    first match wins; with no match the result is\n",
    "    'Strain could not be classified'.\n",
    "    Raises IndexError when fewer than four values are supplied.\n",
    "    \"\"\"\n",
    "    # list() yields the values in order for every supported container, so no\n",
    "    # integer key is ever looked up on a label-indexed Series\n",
    "    values = list(mic)\n",
    "    o_mic, s_mic, r_mic, i_mic = [values[pos].replace(' ', '') for pos in range(4)]\n",
    "    checks = (\n",
    "        (s_mic, 's', 'Susceptible'),\n",
    "        (r_mic, 'r', 'Resistant'),\n",
    "        (i_mic, 'i', 'Intermediate'),\n",
    "    )\n",
    "    try:\n",
    "        for breakpoint_mic, mic_type, label in checks:\n",
    "            if check_mic(o_mic, breakpoint_mic, mic_type) == label:\n",
    "                return label\n",
    "    except ValueError:\n",
    "        # an unparsable number ends the comparison for this row\n",
    "        return 'Strain could not be classified'\n",
    "    return 'Strain could not be classified'\n",
    "\n",
    "#mic=['128','16/4','128/4','32/4-64/4']\n",
    "#sus_res_int(mic)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "0e22ef0d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# for input argument\n",
    "# positional arguments: user AST csv, CLSI breakpoint csv, output table csv\n",
    "if len(sys.argv) < 4:\n",
    "    # stop with a readable message instead of an IndexError on sys.argv[3]\n",
    "    sys.exit('Usage: {} <input_user.csv> <input_clsi.csv> <output_table.csv>'.format(sys.argv[0]))\n",
    "input_user, input_clsi, output_table = sys.argv[1:4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "21d5fe63",
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"input_user='~/Jupyterlab_notebook/ASIST_module/strain_profiles_16k.csv.csv'\n",
    "#input_user='test-data/input2.csv'\n",
    "input_clsi='test-data/clsi.csv'\n",
    "output_profile='test-data/input2_profile.csv'\n",
    "#output_table='test-data/input2_table.csv'\n",
    "output_table='/home/rakesh/Jupyterlab_notebook/ASIST_module/strain_profiles_16k_table.csv'\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "1e64b025",
   "metadata": {},
   "outputs": [],
   "source": [
    "# read user AST data with selected 3 columns\n",
    "# dtype=str keeps purely numeric MIC / strain values as text; the regex clean-up\n",
    "# and the .str name normalisation further down only accept strings\n",
    "strain_mic = pd.read_csv(\n",
    "    input_user,\n",
    "    sep=',',\n",
    "    usecols=['Strain name', 'Antibiotics', 'MIC'],\n",
    "    dtype=str,\n",
    "    na_filter=False,\n",
    ")\n",
    "#strain_mic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "0d30ddc3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# read the CLSI breakpoint table\n",
    "# na_filter=False reads blank cells as '' (as for the user AST table) and\n",
    "# dtype=str keeps numeric-looking breakpoints as text for the regex clean-up\n",
    "clsi_bp = pd.read_csv(input_clsi, sep=',', dtype=str, na_filter=False)\n",
    "\n",
    "#clsi_bp[clsi_bp[['Antibiotics', 'Susceptible']].duplicated()].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "a818676d",
   "metadata": {},
   "outputs": [],
   "source": [
    "#clsi_bp\n",
    "#strain_mic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "c2aae757",
   "metadata": {},
   "outputs": [],
   "source": [
    "# stop early when the input holds more than one MIC for a Strain name / Antibiotics pair\n",
    "input_dups=strain_mic[strain_mic[['Strain name','Antibiotics']].duplicated()]\n",
    "if not input_dups.empty:\n",
    "    # the output table receives the error message first ...\n",
    "    with open(output_table, \"w\") as file_object:\n",
    "        file_object.write('S.No.,Strain name,Antibiotics,MIC\\nInput File Error: Please remove duplicate/mutiple MIC values for same combination of Strain name and Antibiotics from input file\\n')\n",
    "    # ... followed by the repeated rows (second and later occurrences)\n",
    "    input_dups.to_csv(output_table,na_rep='NA', mode='a')\n",
    "    # sys.exit() instead of exit(): the latter is the interactive helper added by\n",
    "    # the site module and is not meant for programs\n",
    "    sys.exit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "c6b4c59b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# convert MIC to numbers sMIC, rMIC, iMIC\n",
    "# only digits, '.', '/' and '-' are kept from each breakpoint; missing or\n",
    "# non-text cells are turned into text first so the clean-up cannot raise on them\n",
    "for source_col, target_col in (('Susceptible', 's_mic'), ('Resistant', 'r_mic'), ('Intermediate', 'i_mic')):\n",
    "    clsi_bp[target_col] = clsi_bp[source_col].fillna('').astype(str).str.replace(r'[^0-9.\\/-]', '', regex=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "d0171f94",
   "metadata": {},
   "outputs": [],
   "source": [
    "#clsi_bp['i_mic'] = clsi_bp[['Intermediate']].applymap(lambda x: (re.sub(r'[^0-9.\\/-]', '', x)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "fe45b2dd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read only numbers in MIC values\n",
    "# digits, '.' and '/' are kept, everything else is dropped; values are turned\n",
    "# into text first so a numeric or missing MIC cell cannot raise a TypeError\n",
    "strain_mic['o_mic'] = strain_mic['MIC'].fillna('').astype(str).str.replace(r'[^0-9.\\/]', '', regex=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "ddbbe4d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "#strain_mic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "640508f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# put strain and antibiotic names in one comparable form:\n",
    "# first letter upper-case, the rest lower-case, spaces removed\n",
    "for frame, name_columns in ((strain_mic, ('Strain name', 'Antibiotics')), (clsi_bp, ('Antibiotics',))):\n",
    "    for name_column in name_columns:\n",
    "        capitalised = frame[name_column].str.capitalize()\n",
    "        frame[name_column] = capitalised.str.replace(\" \",\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "b87426f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "#find duplicate values in input files\n",
    "dups=strain_mic[strain_mic[['Strain name', 'Antibiotics']].duplicated(keep=False)]\n",
    "if not dups.empty:\n",
    "    # names are normalised at this point, so rows that differed only in case or\n",
    "    # spacing collide here; stop, because no result table can be built from them\n",
    "    print ('Please provide a single MIC value in input file for given duplicates combination of \\'Strain name and Antibiotics\\' to use the tool:-\\n',dups)\n",
    "    sys.exit(1)\n",
    "\n",
    "#compare CLSI Antibiotics only\n",
    "try:\n",
    "    result=pd.merge(strain_mic, clsi_bp, on='Antibiotics',how='inner')[['Strain name','Antibiotics', 'MIC', 'o_mic', 's_mic', 'r_mic','i_mic']]\n",
    "except KeyError as err:\n",
    "    # an expected column is missing from one of the inputs; result is left\n",
    "    # undefined in that case, so stop with the cause rather than carry on\n",
    "    sys.exit('Error in input Values: {}'.format(err))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "91bfc94d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Strain name</th>\n",
       "      <th>Antibiotics</th>\n",
       "      <th>MIC</th>\n",
       "      <th>o_mic</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [Strain name, Antibiotics, MIC, o_mic]\n",
       "Index: []"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dups.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "id": "b171f205",
   "metadata": {},
   "outputs": [],
   "source": [
    "#compare MIC values and assign Susceptible and Resistant to Strain\n",
    "# each row goes to sus_res_int as a plain tuple (observed, S, R, I); the labels\n",
    "# are stored under a single column key, which also works when result has no rows\n",
    "mic_columns = ['o_mic','s_mic','r_mic','i_mic']\n",
    "result['CLSI_profile'] = [sus_res_int(row) for row in result[mic_columns].itertuples(index=False, name=None)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "id": "3336fd92",
   "metadata": {},
   "outputs": [],
   "source": [
    "#result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "id": "f0dacfd1",
   "metadata": {},
   "outputs": [],
   "source": [
    "#result[['Strain name', 'Antibiotics', 'MIC','s_mic','r_mic','CLSI_profile']].to_csv(output_profile,sep=',', index=False, encoding='utf-8-sig')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "id": "3d8d03f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# build the strain x antibiotic table for ASIST\n",
    "# rows are strains, columns are antibiotics; when a pair has more than one\n",
    "# distinct profile the labels are joined with a space\n",
    "profile_columns = ['Strain name', 'Antibiotics', 'CLSI_profile']\n",
    "table = result[profile_columns].drop_duplicates()\n",
    "result_table = table.pivot_table(\n",
    "    index=['Strain name'],\n",
    "    columns=['Antibiotics'],\n",
    "    values='CLSI_profile',\n",
    "    aggfunc=lambda profiles: ' '.join(profiles),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "id": "7d7223a3",
   "metadata": {},
   "outputs": [],
   "source": [
    "#result_table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "id": "8a41b2ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "#result_table.to_csv(output_table,na_rep='NA')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "id": "8c9e5f87",
   "metadata": {},
   "outputs": [],
   "source": [
    "# reorder the Antibiotics for ASIST\n",
    "# entries must equal the normalised column names (capitalised, no spaces);\n",
    "# filter() silently drops any name that does not match exactly\n",
    "clsi_ab=['Amikacin','Tobramycin','Gentamycin','Netilmicin','Imipenem','Meropenem','Doripenem','Ciprofloxacin','Levofloxacin',\n",
    "         'Piperacillin/tazobactam','Ticarcillin/clavulanicacid','Cefotaxime','Ceftriaxone','Ceftazidime','Cefepime',\n",
    "         'Trimethoprim/sulfamethoxazole','Ampicillin/sulbactam','Colistin','Polymyxinb','Tetracycline','Doxicycline',\n",
    "         'Minocycline']\n",
    "result_selected=result_table.filter(clsi_ab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "id": "cdf43afb",
   "metadata": {},
   "outputs": [],
   "source": [
    "#print(result_selected.shape, result_table.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "id": "c4c4df30",
   "metadata": {},
   "outputs": [],
   "source": [
    "#result_selected.insert(0,'Resistance_phenotype','')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "id": "9adb2703",
   "metadata": {},
   "outputs": [],
   "source": [
    "# give the combination drugs and Polymyxin B their display headers\n",
    "display_headers = {\n",
    "    'Ticarcillin/clavulanicacid': 'Ticarcillin/clavulanic acid',\n",
    "    'Piperacillin/tazobactam': 'Piperacillin/ tazobactam',\n",
    "    'Trimethoprim/sulfamethoxazole': 'Trimethoprim/ sulfamethoxazole',\n",
    "    'Ampicillin/sulbactam': 'Ampicillin/ sulbactam',\n",
    "    'Polymyxinb': 'Polymyxin B',\n",
    "}\n",
    "result_selected = result_selected.rename(columns=display_headers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "id": "50e6cf5f",
   "metadata": {},
   "outputs": [],
   "source": [
    "#result_selected"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "id": "2833671c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# write the final table; empty strain/antibiotic cells are written as 'NA'\n",
    "result_selected.to_csv(\n",
    "    output_table,\n",
    "    na_rep='NA',\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}