"
],
"text/plain": [
" Smoker Status Age\n",
"0 Yes Alive 21.0\n",
"1 Yes Alive 19.3\n",
"2 No Dead 57.5\n",
"3 No Alive 47.1\n",
"4 Yes Alive 81.4\n",
"5 No Alive 36.8\n",
"6 No Alive 23.8\n",
"7 Yes Dead 57.5\n",
"8 Yes Alive 24.8\n",
"9 Yes Alive 49.5\n",
"10 Yes Alive 30.0\n",
"11 No Dead 66.0\n",
"12 Yes Alive 49.2\n",
"13 No Alive 58.4\n",
"14 No Dead 60.6\n",
"15 No Alive 25.1\n",
"16 No Alive 43.5\n",
"17 No Alive 27.1\n",
"18 No Alive 58.3\n",
"19 Yes Alive 65.7\n",
"20 No Dead 73.2\n",
"21 Yes Alive 38.3\n",
"22 No Alive 33.4\n",
"23 Yes Dead 62.3\n",
"24 No Alive 18.0\n",
"25 No Alive 56.2\n",
"26 Yes Alive 59.2\n",
"27 No Alive 25.8\n",
"28 No Dead 36.9\n",
"29 No Alive 20.2\n",
"... ... ... ...\n",
"1284 Yes Dead 36.0\n",
"1285 Yes Alive 48.3\n",
"1286 No Alive 63.1\n",
"1287 No Alive 60.8\n",
"1288 Yes Dead 39.3\n",
"1289 No Alive 36.7\n",
"1290 No Alive 63.8\n",
"1291 No Dead 71.3\n",
"1292 No Alive 57.7\n",
"1293 No Alive 63.2\n",
"1294 No Alive 46.6\n",
"1295 Yes Dead 82.4\n",
"1296 Yes Alive 38.3\n",
"1297 Yes Alive 32.7\n",
"1298 No Alive 39.7\n",
"1299 Yes Dead 60.0\n",
"1300 No Dead 71.0\n",
"1301 No Alive 20.5\n",
"1302 No Alive 44.4\n",
"1303 Yes Alive 31.2\n",
"1304 Yes Alive 47.8\n",
"1305 Yes Alive 60.9\n",
"1306 No Dead 61.4\n",
"1307 Yes Alive 43.0\n",
"1308 No Alive 42.1\n",
"1309 Yes Alive 35.9\n",
"1310 No Alive 22.3\n",
"1311 Yes Dead 62.1\n",
"1312 No Dead 88.6\n",
"1313 No Alive 39.1\n",
"\n",
"[1314 rows x 3 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"raw_data = pd.read_csv(data_file, encoding = 'iso-8859-1', error_bad_lines=False)\n",
"raw_data"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
Smoker
\n",
"
Status
\n",
"
Age
\n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [Smoker, Status, Age]\n",
"Index: []"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows holding at least one missing value; an empty result means the data is complete.\n",
"lignes_incompletes = raw_data.isnull().any(axis=1)\n",
"raw_data[lignes_incompletes]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Aucune ligne du CSV ne contient de valeur manquante."
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"No 732\n",
"Yes 582\n",
"Name: Smoker, dtype: int64"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of rows for each value of the 'Smoker' column.\n",
"colonne_fumeuse = raw_data['Smoker']\n",
"tableau = colonne_fumeuse.value_counts()\n",
"tableau"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On représente ici le nombre total de femmes vivantes et décédées sur la période en fonction de leur habitude de tabagisme."
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
Smoker
\n",
"
Status
\n",
"
Nombre
\n",
"
\n",
" \n",
" \n",
"
\n",
"
0
\n",
"
No
\n",
"
Alive
\n",
"
502
\n",
"
\n",
"
\n",
"
1
\n",
"
No
\n",
"
Dead
\n",
"
230
\n",
"
\n",
"
\n",
"
2
\n",
"
Yes
\n",
"
Alive
\n",
"
443
\n",
"
\n",
"
\n",
"
3
\n",
"
Yes
\n",
"
Dead
\n",
"
139
\n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Smoker Status Nombre\n",
"0 No Alive 502\n",
"1 No Dead 230\n",
"2 Yes Alive 443\n",
"3 Yes Dead 139"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Row count for every (Smoker, Status) pair, flattened into a 'Nombre' column.\n",
"effectifs = raw_data.groupby(['Smoker', 'Status']).size()\n",
"tableau = effectifs.reset_index(name='Nombre')\n",
"tableau"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On calcule maintenant le taux de mortalité des femmes fumeuses et non fumeuses."
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Le taux de mortalité des fumeuses est de 0.23883161512027493\n",
"Le taux de mortalité des non fumeuses est de 0.31420765027322406\n"
]
}
],
"source": [
"def calcul_taux_mortalite(smoker):\n",
"    \"\"\"Return the fraction of rows with the given 'Smoker' value whose 'Status' is 'Dead'.\n",
"\n",
"    Reads the notebook-level `raw_data` frame; `smoker` is compared with ==\n",
"    against the 'Smoker' column (the data shown uses 'Yes' and 'No').\n",
"    \"\"\"\n",
"    dans_le_groupe = raw_data['Smoker'] == smoker\n",
"    decedees = raw_data['Status'] == 'Dead'\n",
"    nb_femmes = dans_le_groupe.sum()\n",
"    nb_femmes_mortes = len(raw_data[dans_le_groupe & decedees])\n",
"    return nb_femmes_mortes / nb_femmes\n",
"\n",
"taux_mortalite_fumeuses, taux_mortalite_non_fumeuses = (\n",
"    calcul_taux_mortalite(statut) for statut in ('Yes', 'No')\n",
")\n",
"\n",
"print(f\"Le taux de mortalité des fumeuses est de {taux_mortalite_fumeuses}\")\n",
"print(f\"Le taux de mortalité des non fumeuses est de {taux_mortalite_non_fumeuses}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On calcule ensuite l'intervalle de confiance de ces taux de mortalité."
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"L'intervale de confiance du taux de mortalité des fumeuses est (0.20419201440918022, 0.27347121583136963)\n",
"L'intervale de confiance du taux de mortalité des non fumeuses est (0.28057994394817404, 0.3478353565982741)\n"
]
}
],
"source": [
"import statsmodels.api as sm\n",
"\n",
"def intervale_confiance(smoker):\n",
"    \"\"\"Return the 95% normal-approximation confidence interval of the death rate.\n",
"\n",
"    The interval comes from statsmodels' proportion_confint, fed with the number\n",
"    of rows whose 'Status' is 'Dead' among the rows of `raw_data` whose 'Smoker'\n",
"    equals `smoker`. Returns the (lower, upper) pair given by proportion_confint.\n",
"    \"\"\"\n",
"    groupe = raw_data[raw_data['Smoker'] == smoker]\n",
"    n = len(groupe)\n",
"    # Count the deaths directly: rebuilding the count as int(n * p) can truncate to\n",
"    # one death too few when n * p lands just below the integer in floating point.\n",
"    nb_deces = int((groupe['Status'] == 'Dead').sum())\n",
"    # alpha=0.05 gives a 95% confidence interval\n",
"    return sm.stats.proportion_confint(count=nb_deces, nobs=n, alpha=0.05, method='normal')\n",
"\n",
"intervale_fumeuses = intervale_confiance('Yes')\n",
"intervale_non_fumeuses = intervale_confiance('No')\n",
"\n",
"print(f\"L'intervale de confiance du taux de mortalité des fumeuses est {intervale_fumeuses}\")\n",
"print(f\"L'intervale de confiance du taux de mortalité des non fumeuses est {intervale_non_fumeuses}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On fait ensuite une représentation graphique de nos données."
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"
"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"status_fumeur = ['No', 'Yes']\n",
"valeur_mortalite = [taux_mortalite_non_fumeuses, taux_mortalite_fumeuses]\n",
"min_intervaleC = [intervale_non_fumeuses[0], intervale_fumeuses[0]]\n",
"max_intervaleC = [intervale_non_fumeuses[1], intervale_fumeuses[1]]\n",
"\n",
"# `yerr` takes distances measured from the bar height (lower row, then upper row),\n",
"# not absolute interval bounds, so convert the bounds into offsets first.\n",
"erreur_basse = [taux - borne for taux, borne in zip(valeur_mortalite, min_intervaleC)]\n",
"erreur_haute = [borne - taux for taux, borne in zip(valeur_mortalite, max_intervaleC)]\n",
"\n",
"plt.bar(status_fumeur, valeur_mortalite, yerr=[erreur_basse, erreur_haute], capsize=5)\n",
"plt.title('Taux de mortalité par statut de fumeur')\n",
"plt.ylabel('Taux de mortalité')\n",
"plt.xlabel('Statut de fumeur')\n",
"plt.ylim(0, 1)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Le résultat est étrange car le taux de mortalité des femmes fumeuses est inférieur à celui des femmes non fumeuses, ce qui est incohérent par rapport à nos connaissances sur les conséquences de l'usage du tabac. Nous avons sûrement oublié de prendre en compte une donnée essentielle."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Partie 2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On reprend donc l'analyse précédente en prenant en compte des périodes d'âges : \n",
" *18-34 ans, 35-54 ans, 55-64 ans, plus de 65 ans*"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
Smoker
\n",
"
Status
\n",
"
Nombre
\n",
"
\n",
"
\n",
"
Age Groupe
\n",
"
\n",
"
\n",
"
\n",
"
\n",
" \n",
" \n",
"
\n",
"
18-34
\n",
"
No
\n",
"
Alive
\n",
"
213
\n",
"
\n",
"
\n",
"
18-34
\n",
"
No
\n",
"
Dead
\n",
"
6
\n",
"
\n",
"
\n",
"
18-34
\n",
"
Yes
\n",
"
Alive
\n",
"
174
\n",
"
\n",
"
\n",
"
18-34
\n",
"
Yes
\n",
"
Dead
\n",
"
5
\n",
"
\n",
"
\n",
"
35-54
\n",
"
No
\n",
"
Alive
\n",
"
180
\n",
"
\n",
"
\n",
"
35-54
\n",
"
No
\n",
"
Dead
\n",
"
19
\n",
"
\n",
"
\n",
"
35-54
\n",
"
Yes
\n",
"
Alive
\n",
"
198
\n",
"
\n",
"
\n",
"
35-54
\n",
"
Yes
\n",
"
Dead
\n",
"
41
\n",
"
\n",
"
\n",
"
55-64
\n",
"
No
\n",
"
Alive
\n",
"
80
\n",
"
\n",
"
\n",
"
55-64
\n",
"
No
\n",
"
Dead
\n",
"
39
\n",
"
\n",
"
\n",
"
55-64
\n",
"
Yes
\n",
"
Alive
\n",
"
64
\n",
"
\n",
"
\n",
"
55-64
\n",
"
Yes
\n",
"
Dead
\n",
"
51
\n",
"
\n",
"
\n",
"
65+
\n",
"
No
\n",
"
Alive
\n",
"
29
\n",
"
\n",
"
\n",
"
65+
\n",
"
No
\n",
"
Dead
\n",
"
166
\n",
"
\n",
"
\n",
"
65+
\n",
"
Yes
\n",
"
Alive
\n",
"
7
\n",
"
\n",
"
\n",
"
65+
\n",
"
Yes
\n",
"
Dead
\n",
"
42
\n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Smoker Status Nombre\n",
"Age Groupe \n",
"18-34 No Alive 213\n",
"18-34 No Dead 6\n",
"18-34 Yes Alive 174\n",
"18-34 Yes Dead 5\n",
"35-54 No Alive 180\n",
"35-54 No Dead 19\n",
"35-54 Yes Alive 198\n",
"35-54 Yes Dead 41\n",
"55-64 No Alive 80\n",
"55-64 No Dead 39\n",
"55-64 Yes Alive 64\n",
"55-64 Yes Dead 51\n",
"65+ No Alive 29\n",
"65+ No Dead 166\n",
"65+ Yes Alive 7\n",
"65+ Yes Dead 42"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Left-closed bins [18, 35), [35, 55), [55, 65), [65, +inf): ages are fractional, so each\n",
"# label now covers exactly the ages it names (34.5 falls in '18-34', 64.5 in '55-64'),\n",
"# and the open-ended last bin keeps ages of 100 or more in '65+'.\n",
"bins = [18, 35, 55, 65, float('inf')]\n",
"labels = ['18-34', '35-54', '55-64', '65+']\n",
"raw_data['Age Groupe'] = pd.cut(raw_data['Age'], bins=bins, labels=labels, right=False)\n",
"\n",
"# Row count for every (Smoker, Status, Age Groupe) combination, ordered by age group.\n",
"tableau = raw_data.groupby(['Smoker', 'Status', 'Age Groupe']).size().reset_index(name='Nombre')\n",
"tableau_trie = tableau.set_index('Age Groupe').sort_index()\n",
"tableau_trie"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On calcule le taux de mortalité en fonction de ces périodes d'âges."
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Le taux de mortalité des fumeuses de 18 à 34 ans est de 0.00859106529209622\n",
"Le taux de mortalité des non fumeuses de 18 à 34 ans est de 0.00819672131147541\n",
"\n",
"Le taux de mortalité des fumeuses de 35 à 54 ans est de 0.06701030927835051\n",
"Le taux de mortalité des non fumeuses de 35 à 54 ans est de 0.025956284153005466\n",
"\n",
"Le taux de mortalité des fumeuses de 55 à 64 ans est de 0.08762886597938144\n",
"Le taux de mortalité des non fumeuses de 55 à 64 ans est de 0.05327868852459016\n",
"\n",
"Le taux de mortalité des fumeuses de 65 ans et plus est de 0.07216494845360824\n",
"Le taux de mortalité des non fumeuses de 65 ans et plus est de 0.22540983606557377\n",
"\n"
]
}
],
"source": [
"def calcul_taux_mortalite_periode(smoker, age_min, age_max=200):\n",
" nb_femmes_mortes = raw_data[(raw_data['Smoker'] == smoker) & (raw_data['Status'] == 'Dead') & (raw_data['Age']>=age_min) & (raw_data['Age']|z| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"const -6.7955 0.479 -14.174 0.000 -7.735 -5.856\n",
"Age 0.1073 0.008 13.742 0.000 0.092 0.123\n",
"==============================================================================\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: Death No. Observations: 582\n",
"Model: Logit Df Residuals: 580\n",
"Method: MLE Df Model: 1\n",
"Date: Mon, 11 Nov 2024 Pseudo R-squ.: 0.2492\n",
"Time: 17:09:20 Log-Likelihood: -240.21\n",
"converged: True LL-Null: -319.94\n",
" LLR p-value: 1.477e-36\n",
"==============================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"const -5.5081 0.466 -11.814 0.000 -6.422 -4.594\n",
"Age 0.0890 0.009 10.203 0.000 0.072 0.106\n",
"==============================================================================\n"
]
}
],
"source": [
"def ajuster_logit_age(groupe):\n",
"    \"\"\"Fit sm.Logit of 'Death' on a constant plus 'Age' for `groupe`.\n",
"\n",
"    Returns the design matrix, the response column and the fitted result.\n",
"    \"\"\"\n",
"    exog = sm.add_constant(groupe['Age'])\n",
"    endog = groupe['Death']\n",
"    return exog, endog, sm.Logit(endog, exog).fit()\n",
"\n",
"# Model for non-smokers\n",
"non_fumeurs = raw_data[raw_data['Smoker'] == 'No']\n",
"X_ns, y_ns, model_ns = ajuster_logit_age(non_fumeurs)\n",
"\n",
"# Model for smokers\n",
"fumeurs = raw_data[raw_data['Smoker'] == 'Yes']\n",
"X_s, y_s, model_s = ajuster_logit_age(fumeurs)\n",
"\n",
"for modele in (model_ns, model_s):\n",
"    print(modele.summary())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On représente ensuite la régression logistique à l'aide d'un graphique."
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"import numpy as np\n",
"\n",
"# Ages at which both fitted models are evaluated.\n",
"tranche_age = np.linspace(18, 85, 100)\n",
"# Same layout as the design matrices used for fitting: a constant column, then the age.\n",
"exog_pred = sm.add_constant(tranche_age)\n",
"\n",
"# The grid is `tranche_age`; the previous `age_range` name was never defined and\n",
"# made this cell fail with a NameError on a fresh kernel.\n",
"pred_ns = model_ns.predict(exog_pred)\n",
"pred_s = model_s.predict(exog_pred)\n",
"\n",
"# Band of +/- 1.96 * sqrt(p * (1 - p) / n) around each predicted probability, n being the group size.\n",
"# NOTE(review): this is a binomial-style approximation, not the standard error of the fitted curve - confirm it is the intended band.\n",
"demi_largeur_ns = 1.96 * np.sqrt(pred_ns * (1 - pred_ns) / len(non_fumeurs))\n",
"lower_bound_ns = pred_ns - demi_largeur_ns\n",
"upper_bound_ns = pred_ns + demi_largeur_ns\n",
"\n",
"demi_largeur_s = 1.96 * np.sqrt(pred_s * (1 - pred_s) / len(fumeurs))\n",
"lower_bound_s = pred_s - demi_largeur_s\n",
"upper_bound_s = pred_s + demi_largeur_s\n",
"\n",
"plt.figure(figsize=(10, 6))\n",
"plt.plot(tranche_age, pred_ns, label='Probabilité de décès (non-fumeurs)', color='blue')\n",
"plt.fill_between(tranche_age, lower_bound_ns, upper_bound_ns, color='blue', alpha=0.2)\n",
"plt.plot(tranche_age, pred_s, label='Probabilité de décès (fumeurs)', color='red')\n",
"plt.fill_between(tranche_age, lower_bound_s, upper_bound_s, color='red', alpha=0.2)\n",
"plt.title('Régression logistique : Probabilité de décès en fonction de l\\'âge')\n",
"plt.xlabel('Âge')\n",
"plt.ylabel('Probabilité de décès')\n",
"plt.legend()\n",
"plt.ylim(0, 1)\n",
"plt.grid()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Ces régressions nous permettent de conclure que la nocivité du tabagisme influe particulièrement sur le taux de mortalité des femmes de 18 à 65 ans. Les femmes fumeuses ont ainsi plus de chances de mourir jeunes que les femmes non fumeuses. Cependant, ce phénomène s'inverse pour les femmes de plus de 65 ans ; cela pourrait être dû à l'infériorité numérique des femmes fumeuses de plus de 65 ans par rapport aux femmes non fumeuses du même âge."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}