diff --git a/module3/exo3/Simson's Paradox Analysis_AlessandroBroli.ipynb b/module3/exo3/Simson's Paradox Analysis_AlessandroBroli.ipynb index f2ebabed7596de25f20e4ecde33b4a65ee04be7f..3b4511e579b280644eb14d88acb190a5217b15b2 100644 --- a/module3/exo3/Simson's Paradox Analysis_AlessandroBroli.ipynb +++ b/module3/exo3/Simson's Paradox Analysis_AlessandroBroli.ipynb @@ -952,7 +952,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -1004,7 +1004,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -1056,7 +1056,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -1108,7 +1108,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -1180,7 +1180,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -1200,7 +1200,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -1220,7 +1220,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -1249,7 +1249,7 @@ "================================================================\n", "Model: Logit Pseudo R-squared: -0.133 \n", "Dependent Variable: Death AIC: 1770.6050\n", - "Date: 2020-09-16 18:51 BIC: 1775.7858\n", + "Date: 2020-09-16 23:02 BIC: 1775.7858\n", "No. Observations: 1314 Log-Likelihood: -884.30 \n", "Df Model: 0 LL-Null: -780.16 \n", "Df Residuals: 1313 LLR p-value: nan \n", @@ -1297,159 +1297,6 @@ "print(result.summary2())" ] }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
SmokerStatusAgeAge_ClassDeath
0YesAlive21.0young0
1YesAlive19.3young0
4YesAlive81.4senior0
7YesDead57.5middle age1
8YesAlive24.8young0
..................
1304YesAlive47.8adult0
1305YesAlive60.9middle age0
1307YesAlive43.0adult0
1309YesAlive35.9adult0
1311YesDead62.1middle age1
\n", - "

582 rows × 5 columns

\n", - "
" - ], - "text/plain": [ - " Smoker Status Age Age_Class Death\n", - "0 Yes Alive 21.0 young 0\n", - "1 Yes Alive 19.3 young 0\n", - "4 Yes Alive 81.4 senior 0\n", - "7 Yes Dead 57.5 middle age 1\n", - "8 Yes Alive 24.8 young 0\n", - "... ... ... ... ... ...\n", - "1304 Yes Alive 47.8 adult 0\n", - "1305 Yes Alive 60.9 middle age 0\n", - "1307 Yes Alive 43.0 adult 0\n", - "1309 Yes Alive 35.9 adult 0\n", - "1311 Yes Dead 62.1 middle age 1\n", - "\n", - "[582 rows x 5 columns]" - ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data_sm" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -1459,7 +1306,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -1473,7 +1320,7 @@ "===============================================================\n", "Model: Logit Pseudo R-squared: -0.152 \n", "Dependent Variable: Death AIC: 738.9078\n", - "Date: 2020-09-16 18:58 BIC: 743.2743\n", + "Date: 2020-09-16 23:02 BIC: 743.2743\n", "No. Observations: 582 Log-Likelihood: -368.45 \n", "Df Model: 0 LL-Null: -319.94 \n", "Df Residuals: 581 LLR p-value: nan \n", @@ -1498,7 +1345,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -1512,7 +1359,7 @@ "================================================================\n", "Model: Logit Pseudo R-squared: -0.105 \n", "Dependent Variable: Death AIC: 1009.0908\n", - "Date: 2020-09-16 18:59 BIC: 1013.6866\n", + "Date: 2020-09-16 23:02 BIC: 1013.6866\n", "No. 
Observations: 732 Log-Likelihood: -503.55 \n", "Df Model: 0 LL-Null: -455.62 \n", "Df Residuals: 731 LLR p-value: nan \n", @@ -1545,10 +1392,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import classification_report, confusion_matrix" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Conclusion\n", + "\n", + "In conclusion, we processed data from a group of people that contained smoking status, living status at the time of the second point and age at the second point of study. \n", + "We have found an inverse association between smoking status and mortality at first glance. The fact of being a smoker was associated with a smaller mortality rate. This could make us believe that smoking protects us from death. However, to evaluate the true and direct effect of smoking on health status we should take into consideration the variable age because age affects your chances of death and smoking affects the age at which people were taken into consideration. Indeed, we could end up with a population where elderly people are smokers and die less frequently because all the people susceptible to adverse effects of smoking have already died (for example, of lung cancer). This would constitute an important bias.\n", + "The bias comes from age acting as a confounding variable.\n", + "To sum up, the Simpson's paradox we analyzed comes from the fact that there is an association between two variables X (Smoker status) and Y (mortality rate), that is inverted when we condition on a third parameter (Age class). This becomes very clear if we take into consideration the causal DAG connecting the three variables. " + ] } ], "metadata": {