diff --git a/module3/exo3/exercice.ipynb b/module3/exo3/exercice.ipynb index 162b3a48583d248f4ad8c6a1ebd2b1ee6fcba5bd..3cee773652f94f7025e8ac173c6503bbfa4effab 100644 --- a/module3/exo3/exercice.ipynb +++ b/module3/exo3/exercice.ipynb @@ -3,8 +3,8 @@ { "cell_type": "markdown", "metadata": { - "hideCode": true, - "hidePrompt": true + "hideCode": false, + "hidePrompt": false }, "source": [ "# Sujet 6 : Autour du Paradoxe de Simpson" @@ -14,8 +14,8 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "hideCode": true, - "hidePrompt": true + "hideCode": false, + "hidePrompt": false }, "outputs": [], "source": [ @@ -28,10 +28,12 @@ { "cell_type": "markdown", "metadata": { - "hideCode": true, - "hidePrompt": true + "hideCode": false, + "hidePrompt": false }, "source": [ + "## Dataset load\n", + "\n", "We start by load the data and to store it locally if it is not already stored" ] }, @@ -39,8 +41,8 @@ "cell_type": "code", "execution_count": 2, "metadata": { - "hideCode": true, - "hidePrompt": true + "hideCode": false, + "hidePrompt": false }, "outputs": [], "source": [ @@ -52,8 +54,9 @@ "cell_type": "code", "execution_count": 3, "metadata": { - "hideCode": true, - "hidePrompt": true + "hideCode": false, + "hideOutput": true, + "hidePrompt": false }, "outputs": [ { @@ -539,22 +542,29 @@ "\n", "import os \n", "if os.path.exists(chemin):\n", - " raw_data = pd.read_csv(chemin)\n", + " df = pd.read_csv(chemin)\n", " #, skiprows=1)\n", "else :\n", " store_data_locally()\n", - " raw_data = pd.read_csv(chemin)\n", + " df = pd.read_csv(chemin)\n", " #, skiprows=1)\n", " \n", - "raw_data" + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dataset Exploration" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 4, "metadata": { - "hideCode": true, - "hidePrompt": true + "hideCode": false, + "hidePrompt": false }, "outputs": [ { @@ -563,21 +573,21 @@ "(1314, 3)" ] }, - "execution_count": 12, + "execution_count": 4, 
"metadata": {}, "output_type": "execute_result" } ], "source": [ - "raw_data.shape" + "df.shape" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": { - "hideCode": true, - "hidePrompt": true + "hideCode": false, + "hidePrompt": false }, "outputs": [ { @@ -650,42 +660,76 @@ "4 Yes Alive 81.4" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "raw_data.head()" + "df.head()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { - "hideCode": true, - "hidePrompt": true + "hideCode": false, + "hidePrompt": false }, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Smoker', 'Status', 'Age'], dtype='object')" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.columns" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": { - "hideCode": true, + "hideCode": false, "hideOutput": true, - "hidePrompt": true + "hidePrompt": false }, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Smoker Status Age\n", + "count 1314 1314 1314.000000\n", + "unique 2 2 NaN\n", + "top No Alive NaN\n", + "freq 732 945 NaN\n", + "mean NaN NaN 47.359361\n", + "std NaN NaN 19.160667\n", + "min NaN NaN 18.000000\n", + "25% NaN NaN 31.300000\n", + "50% NaN NaN 44.800000\n", + "75% NaN NaN 60.600000\n", + "max NaN NaN 89.900000\n" + ] + } + ], + "source": [ + "print(df.describe(include='all'))" + ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": { - "hideCode": true, - "hidePrompt": true + "hideCode": false, + "hidePrompt": false }, "outputs": [ { @@ -704,14 +748,37 @@ } ], "source": [ - "raw_data.info()" + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "hideCode": false, + 
"hidePrompt": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1314" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df)" ] }, { "cell_type": "markdown", "metadata": { - "hideCode": true, - "hidePrompt": true + "hideCode": false, + "hidePrompt": false }, "source": [ "So this dataset has information about 1314 women and there are no missing values" @@ -719,72 +786,1399 @@ }, { "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Some graphics" + ] + }, + { + "cell_type": "code", + "execution_count": 45, "metadata": { - "hideCode": true, - "hidePrompt": true + "hideOutput": false }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[]],\n", + " dtype=object)" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAEICAYAAABRSj9aAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAFa9JREFUeJzt3X+Q3Hd93/HnqyJxjY/agM1F2AaZGYcULFDQjaGFeO5wIObHQKCB2iXELiSCGWih1UxrkhRIKDOQYmhnnB8VsTENiQ8XY3DNj9jjIkg644AEJpIxBhsUYxlkwMZG4HEq8+4f+1V9KCfteX/c7X70fMzs3O7n+93vvnZ1et33Pvvd76WqkCS16x+tdQBJ0nhZ9JLUOItekhpn0UtS4yx6SWqcRS9JjbPoJalxFr2Oakm2J7knyTFrnUUaF4teR60kG4BfAgp4yZqGkcbIotfR7DeAG4DLgPMPDiZ5bJL/leS+JF9I8p+T/PWS5b+Q5Lokdye5JckrVz+6tHKPWOsA0hr6DeC9wN8ANySZrap9wB8CPwJ+DtgA/CXwdwBJjgOuA94KvAB4GnBtkpuq6qZVfwbSCrhHr6NSkucATwSuqKqdwG3Av0qyDvgXwNuq6sdV9RXgg0vu+mJgT1V9oKoOVNUXgSuBX1vlpyCtmEWvo9X5wLVV9b3u9l90YyfR+033W0vWXXr9icAzk/zg4AV4Fb29f2kiOXWjo06SY4FXAuuSfKcbPgY4AZgFDgCnAF/rlp265O7fAj5bVc9bpbjS0OJpinW0SXIevXn4TcDfL1l0BfAFeiX/IPCbwBOAa4Hbq+o5SR4F7AZ+F1js7rcJ2F9VN6/OM5AeHqdudDQ6H/hAVd1eVd85eAEupjcN80bgeOA7wJ8BlwMPAFTVD4HnA+cCd3brvJvebwTSRHKPXuojybuBn6uq8/uuLE0g9+ilQ3THyT8tPWcCrwWuWutc0qB8M1b6hx5Fb7rm8cBdwEXAx9c0kTQEp24kqXFO3UhS4yZi6ubEE0+sDRs2jG37P/rRjzjuuOPGtv1RmZacMD1ZzTl605L1aMi5c+fO71XVSX1XrKo1v2zev
LnG6TOf+cxYtz8q05KzanqymnP0piXr0ZAT2FEr6FinbiSpcRa9JDXOopekxln0ktQ4i16SGmfRS1LjLHpJapxFL0mNs+glqXETcQoEPTwbLvzEmj32nne9aM0eW9Jg3KOXpMZZ9JLUOItekhrXxBx9vznrrRsPcMEY5rWdr5Y0Dfru0Se5NMldSXYvGftwkhu7y54kN3bjG5Lcv2TZn4wzvCSpv5Xs0V8GXAz8j4MDVfUvD15PchFw75L1b6uqTaMKKEkaTt+ir6rPJdmw3LIkAV4JPHe0sSRJo7KiPw7eFf01VXXGIeNnAe+tqrkl690EfA24D/jdqvqrw2xzC7AFYHZ2dvPi4uKgz4Fde+894vLZY2Hf/QNv/rA2nnz8SLe3f/9+ZmZm+q7X7/mO08HnvNKsa82cozctWY+GnAsLCzsP9u+RDPtm7HnA5Utufxt4QlV9P8lm4GNJnlpV9x16x6raBmwDmJubq/n5+YFD9HujdevGA1y0a/TvO+951fxIt7d9+3ZW8jqM443llTr4nFeada2Zc/SmJas5HzLw4ZVJHgG8HPjwwbGqeqCqvt9d3wncBvz8sCElSYMb5jj6Xwa+WlV3HBxIclKSdd31JwGnA98YLqIkaRh95zOSXA7MAycmuQN4W1VdApzLT0/bAJwF/H6SA8CDwOur6u7RRp4coz7nzLiO95d0dFvJUTfnHWb8gmXGrgSuHD6WJGlUPAWCJDXOopekxln0ktQ4i16SGmfRS1LjLHpJalwT56OXxmk1/kbvcp+h8O8daFTco5ekxln0ktQ4i16SGmfRS1LjfDNWD8vBNyZX+wRsvjEpDc49eklqnEUvSY2z6CWpcRa9JDXOopekxln0ktQ4i16SGte36JNcmuSuJLuXjL09yd4kN3aXFy5Z9pYktya5JcmvjCu4JGllVrJHfxlwzjLj76uqTd3lkwBJngKcCzy1u88fJVk3qrCSpIevb9FX1eeAu1e4vZcCi1X1QFV9E7gVOHOIfJKkIaWq+q+UbACuqaozuttvBy4A7gN2AFur6p4kFwM3VNWHuvUuAT5VVR9ZZptbgC0As7OzmxcXFwd+Erv23nvE5bPHwr77B978qpmWnLD6WTeefPxA99u/fz8zMzNDPXa/769RWO71HPQ5j9soXtPVcDTkXFhY2FlVc/3WG/RcN38MvAOo7utFwGuALLPusj9JqmobsA1gbm6u5ufnB4xC33OubN14gIt2Tf5pfaYlJ6x+1j2vmh/oftu3b2eY7y3o//01Csu9noM+53EbxWu6Gsz5kIGOuqmqfVX1YFX9BHg/D03P3AGcumTVU4A7h4soSRrGQEWfZP2Smy8DDh6RczVwbpJjkpwGnA58friIkqRh9P3dO8nlwDxwYpI7gLcB80k20ZuW2QO8DqCqbkpyBfAV4ADwhqp6cDzRJUkr0bfoq+q8ZYYvOcL67wTeOUwoSdLoTMc7fzrqbRjwDdHV/gMp0iTyFAiS1DiLXpIaZ9FLUuMseklqnEUvSY2z6CWpcRa9JDXOopekxln0ktQ4i16SGmfRS1LjLHpJapxFL0mNs+glqXEWvSQ1zqKXpMZZ9JLUOItekhrXt+iTXJrkriS7l4z9lyRfTfK3Sa5KckI3viHJ/Ulu7C5/Ms7wkqT+VrJHfxlwziFj1wFnVNXTgK8Bb1my7Laq2tRdXj+amJKkQfUt+qr6HHD3IWPXVtWB7uYNwCljyCZJGoFRzNG/BvjUktunJflSks8m+aURbF+SNIRUVf+Vkg3ANVV1xiHjvwPMAS+vqkpyDDBTVd9Pshn4GPDUqrpvmW1uAbYAzM7Obl5cXBz4Sezae+8Rl88eC/vuH3jzq2ZacsL0ZJ3mnBtPPn5twvSxf/9+ZmZm1jpGX0dDzoWFhZ1VNddvvUcMtHUgyfnAi4Gzq/tpUVUPAA9013cmuQ34eWDHofevqm3ANoC5ubman58fNAoXXPiJIy7fuvEAF+0a+KmummnJCdOTd
Zpz7nnV/NqE6WP79u0M8/91tZjzIQNN3SQ5B/iPwEuq6sdLxk9Ksq67/iTgdOAbowgqSRpM312dJJcD88CJSe4A3kbvKJtjgOuSANzQHWFzFvD7SQ4ADwKvr6q7l92wJGlV9C36qjpvmeFLDrPulcCVw4aSJI2On4yVpMZZ9JLUOItekhpn0UtS4yx6SWqcRS9JjbPoJalxFr0kNc6il6TGTf7ZnqSj1IY+J+sbpz3vetGaPbZGzz16SWqcRS9JjbPoJalxFr0kNc6il6TGWfSS1DiLXpIaZ9FLUuMseklqnEUvSY3rW/RJLk1yV5LdS8Yek+S6JF/vvj56ybK3JLk1yS1JfmVcwSVJK7OSPfrLgHMOGbsQuL6qTgeu726T5CnAucBTu/v8UZJ1I0srSXrY+hZ9VX0OuPuQ4ZcCH+yufxD41SXji1X1QFV9E7gVOHNEWSVJA0hV9V8p2QBcU1VndLd/UFUnLFl+T1U9OsnFwA1V9aFu/BLgU1X1kWW2uQXYAjA7O7t5cXFx4Cexa++9R1w+eyzsu3/gza+aackJ05PVnIPZePLxh122f/9+ZmZmVjHNYI6GnAsLCzuraq7feqM+TXGWGVv2J0lVbQO2AczNzdX8/PzAD3pBn9O5bt14gIt2Tf4ZmaclJ0xPVnMOZs+r5g+7bPv27Qzz/3W1mPMhgx51sy/JeoDu613d+B3AqUvWOwW4c/B4kqRhDVr0VwPnd9fPBz6+ZPzcJMckOQ04Hfj8cBElScPo+7tiksuBeeDEJHcAbwPeBVyR5LXA7cArAKrqpiRXAF8BDgBvqKoHx5RdkrQCfYu+qs47zKKzD7P+O4F3DhNKkjQ6fjJWkhpn0UtS4yx6SWqcRS9JjbPoJalxFr0kNc6il6TGWfSS1DiLXpIaZ9FLUuMseklqnEUvSY2z6CWpcRa9JDXOopekxln0ktQ4i16SGmfRS1LjLHpJalzfvxl7OEmeDHx4ydCTgLcCJwC/BXy3G//tqvrkwAklSUMZuOir6hZgE0CSdcBe4CrgXwPvq6r3jCShJGkoo5q6ORu4rar+bkTbkySNSKpq+I0klwJfrKqLk7wduAC4D9gBbK2qe5a5zxZgC8Ds7OzmxcXFgR9/1957j7h89ljYd//Am18105ITpierOQez8eTjD7ts//79zMzMrGKawTycnP06ZJxOO37dwK/nwsLCzqqa67fe0EWf5GeBO4GnVtW+JLPA94AC3gGsr6rXHGkbc3NztWPHjoEzbLjwE0dcvnXjAS7aNfAs1aqZlpwwPVnNOZg973rRYZdt376d+fn51QszoIeTs1+HjNNl5xw38OuZZEVFP4qpmxfQ25vfB1BV+6rqwar6CfB+4MwRPIYkaUCjKPrzgMsP3kiyfsmylwG7R/AYkqQBDfW7YpJHAs8DXrdk+A+SbKI3dbPnkGWSdFijnELZuvEAF6zhlMwkGaroq+rHwGMPGXv1UIkkSSPlJ2MlqXEWvSQ1zqKXpMZZ9JLUOItekhpn0UtS4yx6SWqcRS9JjbPoJalxFr0kNc6il6TGWfSS1DiLXpIaZ9FLUuMseklqnEUvSY2z6CWpcZPzZ+clTYwj/Uk//0Tf9HGPXpIaZ9FLUuOGmrpJsgf4IfAgcKCq5pI8BvgwsAHYA7yyqu4ZLqYkaVCj2KNfqKpNVTXX3b4QuL6qTgeu725LktbIOKZuXgp8sLv+QeBXx/AYkqQVSlUNfufkm8A9QAH/vaq2JflBVZ2wZJ17qurRy9x3C7AFYHZ2dvPi4uLAOXbtvfeIy2ePhX33D7z5VTMtOWF6sppz9KYl67TkPO34dczMzAx034WFhZ1LZlMOa9iif3xV3ZnkccB1wL8Brl5J0S81NzdXO3bsGDjHkQ4Fg97hYBftmvwjSaclJ0xPVnOO3rRknZacl51zHPPz8wPdN8mKin6oqZuqurP7ehdwFXAmsC/J+i7EeuCuYR5DkjScgYs+yXFJHnXwOvB8YDdwNXB+t
9r5wMeHDSlJGtwwv9fMAlclObidv6iqTyf5AnBFktcCtwOvGD6mJGlQAxd9VX0DePoy498Hzh4mlCRpdPxkrCQ1zqKXpMZZ9JLUOItekhpn0UtS4yx6SWqcRS9JjbPoJalxFr0kNc6il6TGWfSS1DiLXpIaZ9FLUuMseklqnEUvSY2z6CWpcRa9JDXOopekxln0ktS4gYs+yalJPpPk5iQ3JXlTN/72JHuT3NhdXji6uJKkh2vgPw4OHAC2VtUXkzwK2Jnkum7Z+6rqPcPHkyQNa+Cir6pvA9/urv8wyc3AyaMKJkkajVTV8BtJNgCfA84A/j1wAXAfsIPeXv89y9xnC7AFYHZ2dvPi4uLAj79r771HXD57LOy7f+DNr5ppyQnTk9WcozctWacl52nHr2NmZmag+y4sLOysqrl+6w1d9ElmgM8C76yqjyaZBb4HFPAOYH1VveZI25ibm6sdO3YMnGHDhZ844vKtGw9w0a5hZqlWx7TkhOnJas7Rm5as05LzsnOOY35+fqD7JllR0Q911E2SnwGuBP68qj4KUFX7qurBqvoJ8H7gzGEeQ5I0nGGOuglwCXBzVb13yfj6Jau9DNg9eDxJ0rCG+b3m2cCrgV1JbuzGfhs4L8kmelM3e4DXDZVQkjSUYY66+Wsgyyz65OBxJEmj5idjJalxFr0kNc6il6TGWfSS1DiLXpIaZ9FLUuMseklqnEUvSY2z6CWpcRa9JDXOopekxln0ktQ4i16SGmfRS1LjLHpJapxFL0mNs+glqXEWvSQ1zqKXpMZZ9JLUuLEVfZJzktyS5NYkF47rcSRJRzaWok+yDvhD4AXAU4DzkjxlHI8lSTqyce3RnwncWlXfqKq/BxaBl47psSRJR5CqGv1Gk18Dzqmq3+xuvxp4ZlW9cck6W4At3c0nA7eMPMhDTgS+N8btj8q05ITpyWrO0ZuWrEdDzidW1Un9VnrEgBvvJ8uM/dRPlKraBmwb0+P/dJhkR1XNrcZjDWNacsL0ZDXn6E1LVnM+ZFxTN3cApy65fQpw55geS5J0BOMq+i8Apyc5LcnPAucCV4/psSRJRzCWqZuqOpDkjcBfAuuAS6vqpnE81gqtyhTRCExLTpierOYcvWnJas7OWN6MlSRNDj8ZK0mNs+glqXFNFX2SU5N8JsnNSW5K8qZu/DFJrkvy9e7roycg6z9O8vkkX+6y/t6kZoXep52TfCnJNd3ticuZZE+SXUluTLJjUnMCJDkhyUeSfLX7fv1nk5Y1yZO71/Lg5b4kb560nF3Wf9f9P9qd5PLu/9fE5QRI8qYu501J3tyNjTVrU0UPHAC2VtU/BZ4FvKE79cKFwPVVdTpwfXd7rT0APLeqng5sAs5J8iwmMyvAm4Cbl9ye1JwLVbVpyXHJk5rzvwGfrqpfAJ5O77WdqKxVdUv3Wm4CNgM/Bq5iwnImORn4t8BcVZ1B7wCQc5mwnABJzgB+i97ZA54OvDjJ6Yw7a1U1ewE+DjyP3qdu13dj64Fb1jrbITkfCXwReOYkZqX3OYjrgecC13Rjk5hzD3DiIWOTmPOfAN+kOxhikrMuyfZ84P9MYk7gZOBbwGPoHUl4TZd3onJ2OV4B/OmS2/8J+A/jztraHv3/l2QD8IvA3wCzVfVtgO7r49Yu2UO66ZAbgbuA66pqUrP+V3rfjD9ZMjaJOQu4NsnO7hQbMJk5nwR8F/hANx32p0mOYzKzHnQucHl3faJyVtVe4D3A7cC3gXur6lomLGdnN3BWkscmeSTwQnofLh1r1iaLPskMcCXw5qq6b63zHE5VPVi9X4tPAc7sfq2bKEleDNxVVTvXOssKPLuqnkHvrKlvSHLWWgc6jEcAzwD+uKp+EfgREzCtcDjdhx5fAvzPtc6ynG4++6XAacDjgeOS/PraplpeVd0MvBu4Dvg08GV6U85j1VzRJ/kZeiX/51X10W54X5L13fL19PagJ0ZV/QDYDpzD5GV9NvCSJHvonYX0uUk+xOTlp
Kru7L7eRW8u+UwmMCe9U4Tc0f0GB/AResU/iVmh94Pzi1W1r7s9aTl/GfhmVX23qv4v8FHgnzN5OQGoqkuq6hlVdRZwN/B1xpy1qaJPEuAS4Oaqeu+SRVcD53fXz6c3d7+mkpyU5ITu+rH0vlm/yoRlraq3VNUpVbWB3q/v/7uqfp0Jy5nkuCSPOnid3hztbiYsJ0BVfQf4VpInd0NnA19hArN2zuOhaRuYvJy3A89K8siuA86m9+b2pOUEIMnjuq9PAF5O77Udb9a1fnNixG90PIfePO3fAjd2lxcCj6X3ZuLXu6+PmYCsTwO+1GXdDby1G5+4rEsyz/PQm7ETlZPevPeXu8tNwO9MYs4leTcBO7p//48Bj57ErPQOFPg+cPySsUnM+Xv0dpR2A38GHDOJObusf0XvB/uXgbNX4zX1FAiS1Limpm4kSf+QRS9JjbPoJalxFr0kNc6il6TGWfSS1DiLXpIa9/8AdlFJ2wbqT4UAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "df.hist(column='Age')" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "df['Age'].plot.kde()" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "#comparaison des distributions avec un boxplot\n", + "df.boxplot(column='Age',by='Status')" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], "source": [ - "1. Question 1\n", + "d={'Alive':1,'Dead':0}\n", + "#df2 = df[['Status','Age']]\n", + "#df2\n", + "new_df=[d[t] for t in df['Status']]\n", + "#df.plot.scatter(x='Age',y='Status')\n", + "#,c=''\n", + "df['Status_2']=new_df\n", + "df\n", "\n", - "Représentez dans un tableau le nombre total de femmes vivantes et décédées sur la période en fonction de leur habitude de tabagisme. \n", + "df.plot.scatter(x='Age',y='Status_2')\n", + "#,c='')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dataset analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "hideCode": false, + "hidePrompt": false + }, + "source": [ + "**1. Question 1**\n", "\n", - "Calculez dans chaque groupe (fumeuses / non fumeuses) le taux de mortalité (le rapport entre le nombre de femmes décédées dans un groupe et le nombre total de femmes dans ce groupe). Vous pourrez proposer une représentation graphique de ces données et calculer des intervalles de confiance si vous le souhaitez. En quoi ce résultat est-il surprenant ?" + "Représentez dans un tableau le nombre total de femmes vivantes et décédées sur la période en fonction de leur habitude de tabagisme. 
" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": { - "hideCode": true, - "hidePrompt": true + "hideCode": false, + "hidePrompt": false }, "outputs": [ { - "ename": "KeyError", - "evalue": "False", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_loc\u001b[0;34m(self, key, method, tolerance)\u001b[0m\n\u001b[1;32m 2524\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2525\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2526\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mKeyError\u001b[0m: False", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m 
\u001b[0mraw_data\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Status'\u001b[0m\u001b[0;34m==\u001b[0m\u001b[0;34m\"Alive\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 2137\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_getitem_multilevel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2138\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2139\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_getitem_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2140\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2141\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_getitem_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_getitem_column\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 2144\u001b[0m \u001b[0;31m# get column\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2145\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_unique\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2146\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_item_cache\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2147\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2148\u001b[0m \u001b[0;31m# duplicate columns & possible reduce dimensionality\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m_get_item_cache\u001b[0;34m(self, item)\u001b[0m\n\u001b[1;32m 1840\u001b[0m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcache\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1841\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mres\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1842\u001b[0;31m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1843\u001b[0m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_box_item_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1844\u001b[0m \u001b[0mcache\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(self, item, fastpath)\u001b[0m\n\u001b[1;32m 3841\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3842\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3843\u001b[0;31m 
\u001b[0mloc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3844\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3845\u001b[0m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0misna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_loc\u001b[0;34m(self, key, method, tolerance)\u001b[0m\n\u001b[1;32m 2525\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2526\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2527\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_maybe_cast_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2528\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2529\u001b[0m \u001b[0mindexer\u001b[0m 
\u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtolerance\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtolerance\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mKeyError\u001b[0m: False" - ] + "data": { + "text/plain": [ + "count 1314\n", + "unique 2\n", + "top Alive\n", + "freq 945\n", + "Name: Status, dtype: object" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "raw_data['Status'==\"Alive\"]" + "df['Status'].describe()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": { - "hideCode": true, - "hidePrompt": true + "hideCode": false, + "hidePrompt": false }, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "Alive 945\n", + "Dead 369\n", + "Name: Status, dtype: int64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['Status'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "hideOutput": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SmokerStatusAge
0YesAlive21.0
1YesAlive19.3
3NoAlive47.1
4YesAlive81.4
5NoAlive36.8
6NoAlive23.8
8YesAlive24.8
9YesAlive49.5
10YesAlive30.0
12YesAlive49.2
13NoAlive58.4
15NoAlive25.1
16NoAlive43.5
17NoAlive27.1
18NoAlive58.3
19YesAlive65.7
21YesAlive38.3
22NoAlive33.4
24NoAlive18.0
25NoAlive56.2
26YesAlive59.2
27NoAlive25.8
29NoAlive20.2
30YesAlive34.6
31YesAlive51.9
32YesAlive49.9
33NoAlive19.4
34NoAlive56.9
35YesAlive46.7
36YesAlive44.4
............
1273YesAlive55.7
1274NoAlive25.7
1275NoAlive19.5
1276YesAlive58.5
1277NoAlive23.4
1278YesAlive43.7
1279NoAlive34.4
1281NoAlive34.9
1282YesAlive51.2
1285YesAlive48.3
1286NoAlive63.1
1287NoAlive60.8
1289NoAlive36.7
1290NoAlive63.8
1292NoAlive57.7
1293NoAlive63.2
1294NoAlive46.6
1296YesAlive38.3
1297YesAlive32.7
1298NoAlive39.7
1301NoAlive20.5
1302NoAlive44.4
1303YesAlive31.2
1304YesAlive47.8
1305YesAlive60.9
1307YesAlive43.0
1308NoAlive42.1
1309YesAlive35.9
1310NoAlive22.3
1313NoAlive39.1
\n", + "

945 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " Smoker Status Age\n", + "0 Yes Alive 21.0\n", + "1 Yes Alive 19.3\n", + "3 No Alive 47.1\n", + "4 Yes Alive 81.4\n", + "5 No Alive 36.8\n", + "6 No Alive 23.8\n", + "8 Yes Alive 24.8\n", + "9 Yes Alive 49.5\n", + "10 Yes Alive 30.0\n", + "12 Yes Alive 49.2\n", + "13 No Alive 58.4\n", + "15 No Alive 25.1\n", + "16 No Alive 43.5\n", + "17 No Alive 27.1\n", + "18 No Alive 58.3\n", + "19 Yes Alive 65.7\n", + "21 Yes Alive 38.3\n", + "22 No Alive 33.4\n", + "24 No Alive 18.0\n", + "25 No Alive 56.2\n", + "26 Yes Alive 59.2\n", + "27 No Alive 25.8\n", + "29 No Alive 20.2\n", + "30 Yes Alive 34.6\n", + "31 Yes Alive 51.9\n", + "32 Yes Alive 49.9\n", + "33 No Alive 19.4\n", + "34 No Alive 56.9\n", + "35 Yes Alive 46.7\n", + "36 Yes Alive 44.4\n", + "... ... ... ...\n", + "1273 Yes Alive 55.7\n", + "1274 No Alive 25.7\n", + "1275 No Alive 19.5\n", + "1276 Yes Alive 58.5\n", + "1277 No Alive 23.4\n", + "1278 Yes Alive 43.7\n", + "1279 No Alive 34.4\n", + "1281 No Alive 34.9\n", + "1282 Yes Alive 51.2\n", + "1285 Yes Alive 48.3\n", + "1286 No Alive 63.1\n", + "1287 No Alive 60.8\n", + "1289 No Alive 36.7\n", + "1290 No Alive 63.8\n", + "1292 No Alive 57.7\n", + "1293 No Alive 63.2\n", + "1294 No Alive 46.6\n", + "1296 Yes Alive 38.3\n", + "1297 Yes Alive 32.7\n", + "1298 No Alive 39.7\n", + "1301 No Alive 20.5\n", + "1302 No Alive 44.4\n", + "1303 Yes Alive 31.2\n", + "1304 Yes Alive 47.8\n", + "1305 Yes Alive 60.9\n", + "1307 Yes Alive 43.0\n", + "1308 No Alive 42.1\n", + "1309 Yes Alive 35.9\n", + "1310 No Alive 22.3\n", + "1313 No Alive 39.1\n", + "\n", + "[945 rows x 3 columns]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.loc[df['Status']==\"Alive\",:]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SmokerNoYes
Status
Alive502443
Dead230139
\n", + "
" + ], + "text/plain": [ + "Smoker No Yes\n", + "Status \n", + "Alive 502 443\n", + "Dead 230 139" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#\n", + "pd.crosstab(df['Status'],df['Smoker'])\n", + "#],normalize='index'" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "hideOutput": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SmokerStatusAge
0YesAlive21.0
1YesAlive19.3
3NoAlive47.1
4YesAlive81.4
5NoAlive36.8
6NoAlive23.8
8YesAlive24.8
9YesAlive49.5
10YesAlive30.0
12YesAlive49.2
13NoAlive58.4
15NoAlive25.1
16NoAlive43.5
17NoAlive27.1
18NoAlive58.3
19YesAlive65.7
21YesAlive38.3
22NoAlive33.4
24NoAlive18.0
25NoAlive56.2
26YesAlive59.2
27NoAlive25.8
29NoAlive20.2
30YesAlive34.6
31YesAlive51.9
32YesAlive49.9
33NoAlive19.4
34NoAlive56.9
35YesAlive46.7
36YesAlive44.4
............
1273YesAlive55.7
1274NoAlive25.7
1275NoAlive19.5
1276YesAlive58.5
1277NoAlive23.4
1278YesAlive43.7
1279NoAlive34.4
1281NoAlive34.9
1282YesAlive51.2
1285YesAlive48.3
1286NoAlive63.1
1287NoAlive60.8
1289NoAlive36.7
1290NoAlive63.8
1292NoAlive57.7
1293NoAlive63.2
1294NoAlive46.6
1296YesAlive38.3
1297YesAlive32.7
1298NoAlive39.7
1301NoAlive20.5
1302NoAlive44.4
1303YesAlive31.2
1304YesAlive47.8
1305YesAlive60.9
1307YesAlive43.0
1308NoAlive42.1
1309YesAlive35.9
1310NoAlive22.3
1313NoAlive39.1
\n", + "

945 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " Smoker Status Age\n", + "0 Yes Alive 21.0\n", + "1 Yes Alive 19.3\n", + "3 No Alive 47.1\n", + "4 Yes Alive 81.4\n", + "5 No Alive 36.8\n", + "6 No Alive 23.8\n", + "8 Yes Alive 24.8\n", + "9 Yes Alive 49.5\n", + "10 Yes Alive 30.0\n", + "12 Yes Alive 49.2\n", + "13 No Alive 58.4\n", + "15 No Alive 25.1\n", + "16 No Alive 43.5\n", + "17 No Alive 27.1\n", + "18 No Alive 58.3\n", + "19 Yes Alive 65.7\n", + "21 Yes Alive 38.3\n", + "22 No Alive 33.4\n", + "24 No Alive 18.0\n", + "25 No Alive 56.2\n", + "26 Yes Alive 59.2\n", + "27 No Alive 25.8\n", + "29 No Alive 20.2\n", + "30 Yes Alive 34.6\n", + "31 Yes Alive 51.9\n", + "32 Yes Alive 49.9\n", + "33 No Alive 19.4\n", + "34 No Alive 56.9\n", + "35 Yes Alive 46.7\n", + "36 Yes Alive 44.4\n", + "... ... ... ...\n", + "1273 Yes Alive 55.7\n", + "1274 No Alive 25.7\n", + "1275 No Alive 19.5\n", + "1276 Yes Alive 58.5\n", + "1277 No Alive 23.4\n", + "1278 Yes Alive 43.7\n", + "1279 No Alive 34.4\n", + "1281 No Alive 34.9\n", + "1282 Yes Alive 51.2\n", + "1285 Yes Alive 48.3\n", + "1286 No Alive 63.1\n", + "1287 No Alive 60.8\n", + "1289 No Alive 36.7\n", + "1290 No Alive 63.8\n", + "1292 No Alive 57.7\n", + "1293 No Alive 63.2\n", + "1294 No Alive 46.6\n", + "1296 Yes Alive 38.3\n", + "1297 Yes Alive 32.7\n", + "1298 No Alive 39.7\n", + "1301 No Alive 20.5\n", + "1302 No Alive 44.4\n", + "1303 Yes Alive 31.2\n", + "1304 Yes Alive 47.8\n", + "1305 Yes Alive 60.9\n", + "1307 Yes Alive 43.0\n", + "1308 No Alive 42.1\n", + "1309 Yes Alive 35.9\n", + "1310 No Alive 22.3\n", + "1313 No Alive 39.1\n", + "\n", + "[945 rows x 3 columns]" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.groupby('Status').get_group('Alive')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**1. 
Question 1**\n", + "\n", + "Calculez dans chaque groupe (fumeuses / non fumeuses) le taux de mortalité (le rapport entre le nombre de femmes décédées dans un groupe et le nombre total de femmes dans ce groupe). " + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "hideOutput": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Le taux de mortalité des fumeuses est de 23.88 %\n" + ] + } + ], + "source": [ + "# Mortality rate among smokers = deceased smokers / all smokers.\n", + "# The 'Dead' mask is built from the filtered frame itself so that its index matches;\n", + "# masking the subset with df['Status'] makes pandas reindex the mask and warn.\n", + "smokers = df.loc[df['Smoker']==\"Yes\"]\n", + "smoker = smokers.loc[smokers['Status']==\"Dead\"].shape[0] / smokers.shape[0]\n", + "print(\"Le taux de mortalité des fumeuses est de {} %\".format(round(smoker*100,2)))" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Le taux de mortalité des non fumeuses est de 31.42 %\n" + ] + } + ], + "source": [ + "# Mortality rate among non-smokers = deceased non-smokers / all non-smokers.\n", + "# Same index-aligned mask as for the smokers; the label names the right group.\n", + "non_smokers = df.loc[df['Smoker']==\"No\"]\n", + "no_smoker = non_smokers.loc[non_smokers['Status']==\"Dead\"].shape[0] / non_smokers.shape[0]\n", + "print(\"Le taux de mortalité des non fumeuses est de {} %\".format(round(no_smoker*100,2)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**1. Question 1**\n", + " \n", + "Vous pourrez proposer une représentation graphique de ces données et calculer des intervalles de confiance si vous le souhaitez. En quoi ce résultat est-il surprenant ?"
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import scipy.stats\n", + "\n", + "\n", + "def mean_confidence_interval(data, confidence=0.95):\n", + "    \"\"\"Return (mean, lower, upper) for the Student-t confidence interval of the mean.\n", + "\n", + "    data must be a 1-D collection of numbers. Passing the whole DataFrame fails:\n", + "    its text columns cannot be multiplied by 1.0 and raise TypeError.\n", + "    confidence is the two-sided confidence level (0.95 by default).\n", + "    \"\"\"\n", + "    a = 1.0 * np.array(data)\n", + "    n = len(a)\n", + "    m, se = np.mean(a), scipy.stats.sem(a)\n", + "    # Half-width of the interval: standard error times the t quantile with n-1 degrees of freedom.\n", + "    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)\n", + "    return m, m-h, m+h\n", + "\n", + "\n", + "# Death indicator: 1.0 = Dead, 0.0 = Alive. Its mean within a group is that group's\n", + "# mortality rate, so the interval returned is a confidence interval on the rate.\n", + "death = (df['Status'] == 'Dead').astype(float)\n", + "for smoker_flag, indicator in death.groupby(df['Smoker']):\n", + "    rate, low, high = mean_confidence_interval(indicator)\n", + "    print('Smoker = {} : taux de mortalité {:.2f} % (IC 95 % : {:.2f} - {:.2f} %)'.format(\n", + "        smoker_flag, 100 * rate, 100 * low, 100 * high))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**2. Question 2**\n", + "\n", + "Reprenez la question 1 (effectifs et taux de mortalité) en rajoutant une nouvelle catégorie liée à la classe d'âge. On considérera par exemple les classes suivantes : 18-34 ans, 34-54 ans, 55-64 ans, plus de 65 ans. En quoi ce résultat est-il surprenant ? Arrivez-vous à expliquer ce paradoxe ? De même, vous pourrez proposer une représentation graphique de ces données pour étayer vos explications." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**3. Question 3**\n", + "\n", + "Afin d'éviter un biais induit par des regroupements en tranches d'âges arbitraires et non régulières, il est envisageable d'essayer de réaliser une régression logistique.
Si on introduit une variable Death valant 1 ou 0 pour indiquer si l'individu est décédé durant la période de 20 ans, on peut étudier le modèle Death ~ Age pour étudier la probabilité de décès en fonction de l'âge selon que l'on considère le groupe des fumeuses ou des non fumeuses. Ces régressions vous permettent-elles de conclure sur la nocivité du tabagisme ? Vous pourrez proposer une représentation graphique de ces régressions (en n'omettant pas les régions de confiance)." + ] } ], "metadata": { - "hide_code_all_hidden": true, + "celltoolbar": "Aucun(e)", + "hide_code_all_hidden": false, "kernelspec": { "display_name": "Python 3", "language": "python",