diff --git a/module3/exo3/exercice.ipynb b/module3/exo3/exercice.ipynb index c3ec716f5b778185f257edb4637340f7ad9ad4fd..df9210287dd4f672674d6bc02ccf12c8780a0e89 100644 --- a/module3/exo3/exercice.ipynb +++ b/module3/exo3/exercice.ipynb @@ -20,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -36,68 +36,178 @@ "## I - Chargement des données et controle si leur contenu est valide + extraction de quelques informations générales :" ] }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "Remarque préliminaire : \n", + "* Pour se protéger contre une éventuelle disparition ou modification du serveur ou des fichiers qu'il abrite, \n", + "faire une copie locale de ce jeux de données qui sera préservée avec l'analyse qui va en etre faite. \n", + "* On télécharge les données depuis le WEB seulement si la copie locale n'existe pas." + ] + }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Sur le WEB : \n", + "# * lien permanent du fichier de données :\n", + "#data_url=\"https://gitlab.inria.fr/learninglab/mooc-rr/mooc-rr-ressources/-/blob/e06310de87a61ee756949895d588e65334b0bfc9/module3/Practical_session/Subject6_smoking.csv\"\n", + "# * lien actualisé du fichier de données :\n", + "data_url=\"https://gitlab.inria.fr/learninglab/mooc-rr/mooc-rr-ressources/raw/master/module3/Practical_session/Subject6_smoking.csv\"\n", + "\n", + "# Chemin vers une copie locale de ce fichier : \n", + "LocalInputData=\"\" # \"C:/Users/hpascalj/__DataSets/module3_Practical_session_Subject6_smoking.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "stringC1 = https://gitlab.inria.fr/learninglab/mooc-rr/mooc-rr-ressources/blob/master/module3/Practical_session/Subject6_smoking.csv\n" - ] - }, - { - "ename": "ParserError", - "evalue": 
"Error tokenizing data. C error: Expected 1 fields in line 29, saw 25\n", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mParserError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mprint\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m\"stringC1 = \"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstringC1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 11\u001b[0;31m \u001b[0mOriginalInputData\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstringC1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36mparser_f\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)\u001b[0m\n\u001b[1;32m 707\u001b[0m skip_blank_lines=skip_blank_lines)\n\u001b[1;32m 708\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 709\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 710\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 711\u001b[0m \u001b[0mparser_f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 453\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 454\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 455\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 456\u001b[0m \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 457\u001b[0m \u001b[0mparser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m 1067\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'skipfooter not supported for iteration'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1068\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1069\u001b[0;31m \u001b[0mret\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1070\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1071\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'as_recarray'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m 1837\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1838\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1839\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1840\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1841\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_first_chunk\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.read\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._read_low_memory\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._read_rows\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._tokenize_rows\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in 
\u001b[0;36mpandas._libs.parsers.raise_parser_error\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mParserError\u001b[0m: Error tokenizing data. C error: Expected 1 fields in line 29, saw 25\n" + "the specified local data-file is not available; by default, select the data-file available at the prescribed url !\n" ] } ], "source": [ "# Chargement du jeu de données originales :\n", - "if False :\n", - " # Si fichier de données copié localement :\n", - " pathToFile=\"../__DataSets/\"\n", - " FileName=\"module3_Practical_session_Subject6_smoking.csv\"\n", - " stringC1 = pathToFile+FileName\n", + "# -----------------------------------------\n", + "import os\n", + "import urllib.request\n", + "if not os.path.exists(LocalInputData):\n", + " print (\"the specified local data-file is not available; by default, select the data-file available at the prescribed url !\")\n", + " urllib.request.urlretrieve(data_url, LocalInputData)\n", + " OriginalInputData=pd.read_csv(data_url) # ainsi possible avec une version de pandas >= 0.19.2\n", "else :\n", - " stringC1 = \"https://gitlab.inria.fr/learninglab/mooc-rr/mooc-rr-ressources/blob/master/module3/Practical_session/Subject6_smoking.csv\"\n", - "\n", - "print (\"stringC1 = \", stringC1)\n", - "OriginalInputData = pd.read_csv(stringC1)" + " print (\"usage of the LocalInputData = \", LocalInputData)" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [ { - "ename": "NameError", - "evalue": "name 'OriginalInputData' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mprint\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m\"le nb d'enregistrements = 
\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mOriginalInputData\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\" = (nb_lignes='features-values' , nb_colonnes='features')\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;31m#\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m# InputData_index = OriginalInputData.index\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;31m# InputData_columns = OriginalInputData.columns\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;31m# print (\"InputData : indexe des colonnes = \",InputData_columns)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mNameError\u001b[0m: name 'OriginalInputData' is not defined" + "name": "stdout", + "output_type": "stream", + "text": [ + "le nb d'enregistrements = (1314, 3) = (nb_lignes='features-values' , nb_colonnes='features')\n" ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SmokerStatusAge
0YesAlive21.0
1YesAlive19.3
2NoDead57.5
3NoAlive47.1
4YesAlive81.4
5NoAlive36.8
6NoAlive23.8
7YesDead57.5
8YesAlive24.8
9YesAlive49.5
\n", + "
" + ], + "text/plain": [ + " Smoker Status Age\n", + "0 Yes Alive 21.0\n", + "1 Yes Alive 19.3\n", + "2 No Dead 57.5\n", + "3 No Alive 47.1\n", + "4 Yes Alive 81.4\n", + "5 No Alive 36.8\n", + "6 No Alive 23.8\n", + "7 Yes Dead 57.5\n", + "8 Yes Alive 24.8\n", + "9 Yes Alive 49.5" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -120,19 +230,49 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": {}, "outputs": [ { - "ename": "NameError", - "evalue": "name 'OriginalInputData' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mOriginalInputData\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mOriginalInputData\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misnull\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0many\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mNameError\u001b[0m: name 'OriginalInputData' is not defined" - ] + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SmokerStatusAge
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [Smoker, Status, Age]\n", + "Index: []" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -148,7 +288,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -168,9 +308,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Age min = 18\n", + " Age max = 89\n", + " Age moyen = 47\n", + " Age median = 44\n", + " Age ecartType = 19.15\n" + ] + } + ], "source": [ "Age = OriginalInputData['Age']\n", "# Soit utiliser la fonction suivante objet.describe() \n", @@ -193,9 +345,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "count 1314\n", + "unique 2\n", + "top No\n", + "freq 732\n", + "Name: Smoker, dtype: object" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Vérifions le contenu de la sous-liste de données qui contient 2 données binaires 'Yes' ou 'No' :\n", "Fumeuse_ou_non = OriginalInputData['Smoker']\n", @@ -212,9 +379,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "count 1314\n", + "unique 2\n", + "top Alive\n", + "freq 945\n", + "Name: Status, dtype: object" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Vérifions le contenu de la sous-liste de données qui contient 2 données binaires 'Dead' ou 'Alive' :\n", "Vivante_ou_Morte = OriginalInputData['Status']\n", @@ -238,9 +420,119 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, - 
"outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SmokerStatusAge
0YesAlive21.0
1YesAlive19.3
4YesAlive81.4
7YesDead57.5
8YesAlive24.8
9YesAlive49.5
10YesAlive30.0
12YesAlive49.2
19YesAlive65.7
21YesAlive38.3
\n", + "
" + ], + "text/plain": [ + " Smoker Status Age\n", + "0 Yes Alive 21.0\n", + "1 Yes Alive 19.3\n", + "4 Yes Alive 81.4\n", + "7 Yes Dead 57.5\n", + "8 Yes Alive 24.8\n", + "9 Yes Alive 49.5\n", + "10 Yes Alive 30.0\n", + "12 Yes Alive 49.2\n", + "19 Yes Alive 65.7\n", + "21 Yes Alive 38.3" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# selection des lignes du DataFrame Fumeuse_ou_non en fonction de la valeur 'Yes' dans la colonne descriptive 'Smoker':\n", "Fumeuses = OriginalInputData.loc[OriginalInputData['Smoker'] == 'Yes']\n", @@ -249,9 +541,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nb Fumeuses Mortes= 139 ; Nb Fumeuses Vivantes= 443\n" + ] + } + ], "source": [ "Nb_Fumeuses = Fumeuses.shape[0] # donne le nombre de lignes dans Fumeuses\n", "# print (\"Nb_Fumeuses=\",Nb_Fumeuses)\n", @@ -271,9 +571,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Toutes FumeusesConfonfues : TxMortalite = 23.88 \n" + ] + } + ], "source": [ "# Le taux de mortalité dans le groupe des femmes fumeuses toutes ensemble considérées vaut :\n", "ToutesFumeusesConfonfues_TxMortalite = (Nb_FumeusesMortes)/Nb_Fumeuses * 100\n", @@ -282,9 +590,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nb NonFumeuses Mortes= 230 ; Nb NonFumeuses Vivantes= 502\n" + ] + } + ], "source": [ "NonFumeuses = OriginalInputData.loc[OriginalInputData['Smoker'] == 'No']\n", "Nb_NonFumeuses = NonFumeuses.shape[0] # donne le nombre de lignes dans NonFumeuses\n", @@ -304,9 +620,17 @@ }, { "cell_type": "code", - 
"execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Toutes Non-Fumeuses Confonfues : TxMortalite = 31.42 \n" + ] + } + ], "source": [ "# Le taux de mortalité dans le groupe des femmes Non fumeuses toutes ensemble considérées vaut :\n", "ToutesNonFumeusesConfonfues_TxMortalite = (Nb_NonFumeusesMortes)/Nb_NonFumeuses * 100\n", @@ -322,9 +646,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Groupes SANS distinction d'age Nb_Vivantes Nb_Mortes Mortalité en %\n", + "0 Femmes fumeuses 443 139 23.883162\n", + "1 Femmes non fumeuses 502 230 31.420765\n" + ] + } + ], "source": [ "table0 = {\"Groupes SANS distinction d'age\": ['Femmes fumeuses', 'Femmes non fumeuses'],\n", " 'Nb_Vivantes': [Nb_FumeusesVivantes, Nb_NonFumeusesVivantes],\n", @@ -337,9 +671,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXQAAAEICAYAAABPgw/pAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAE/FJREFUeJzt3X+UXGV9x/H3lyRiCihysmCK1lVKK4gScI+tB6tRWoq2p0BVlAFNj2KslbZIf1lqC9ieU6qo1fqjDUgJLaPQKhWQKkiLVEV0g5QfohY1ChKSRWyBSpGEb/+4z5phM5uZ3ZnNZp+8X+fs2Xuf+9x7v3f37mefeWZmNzITSdLCt9t8FyBJGg4DXZIqYaBLUiUMdEmqhIEuSZUw0CWpEga6qhERz4yIzfNdxzBExNERcUfH+jcj4vl97LcoIv41Il47txVqZ2Sga8Yi4sGOj0cj4qGO9RPnu76dUUScHRHnzXb/zDwgM6/v41jvBP4lMy+c7bm0cC2e7wK08GTmnpPLEbEeODkzPzN/Fe3cImKH/Zxl5mk76lza+ThC19BFxBERcUNE/E9E3B0R75kMtW7TIhHxxYg4qSz/fURc1LHtvRHxyWnOs7hs/36ZnvilKdv3iYgLI+KeiLgzIs6IiK73fBn1XhQRF5dHGjdFxNPLPvdGxPqIeHFH/5+KiCsj4r6I+EZErJpyrHY51gPAq4HTgFXl2F8q/d4YEV+LiAci4o6IeN12vqb3RMQLIuLYaY7V97WqXo7QNRceAU4BbgRGgU8DXwf+to99fwe4OSJeDUwAJwDPmabvKcBLgGeXc146ZftFwB3AM4AnAlcC64G10xzvOOBXgBPLvv8GfAB4MvAm4IPAQaXvPwFfKPs8G7gqIu7IzM+X7S8v204AdgcOAZZl5skd59sAvLTUdCRweUTckJm3TVMfmfkvEfHuLsea6bWqQv4G19Bl5pcy88uZuSUzvwmcB7yoz30fAF4LvJ8mjH4zM++ZpvvxwLsy8+7MnADeMbkhIp4GvBA4LTN/mJkbgPfRjJanc01m/ntmbgb+GXhCOf5m4KPAMyNiaUQcCBwKnJ6ZD2fmeKn1NR3H+mxmXpmZj2bmQ9Nc62WZ+e1sfAb4LPCC7X+FtjXLa1WFHKFr6CLiYOBdwOHAUpr77PPb3emxPkczel3KtqPuTj8J3Nmx/p2O5acBjwcmImKybTeaUex0NnYsPwRM5Na/XjcZynuU805MCerv0IyyJ3XW1VVE/BrwNuCnS20/AfxHr/26mM21qkKO0DUXzqWZbjkgM58AvB2YTJr/BRZFxO4d/Z88Zf/TaKZQ7gdO3c55NgBP7Vj/qY7lO4EHgSdl5t7l4wmZefiMr2ZbdwMjEbF0yrm/17E+9c+YPmY9Ivagmbb5c2DfzNybZoon6G3qsefyWrWAGOiaC3sB/5OZD0bEs4A3dGy7m2Zu/MTymunfAvaf3BgRh9CMWk8qH39WRvzdXAK8JSKWR8Qy4A8nN2Tmt4EvAu+IiL0iYreIODAiZjyl0cUdwM3AX0TE7hFxOLCKZh57OhuBp8fWIfRSYAmwCXi0jNZX9nn+xxxrjq9VC4iBrrnwFuDkiHiQ5knFiyc3ZOYW4GTgDOBemhH2OoCIeBzwj8BZmfnVzPwqzej+HyJiSZfzvJ9miuI24AaagO90ArA38DXgvlLHfoNeXJmGOR44GLinHPcPMnN70yUfpZlSuS8ivpCZ9wK/D1wOfB84luaJzH485lilbU6uVQtL+A8uJKkOjtAlqRIGuiRVwkCXpEoY6JJUiR36xqJly5bl6OjojjylJC1469atuzczR3r126GBPjo6yvj4+I48pSQteBHxnd69nHKRpGoY6JJUCQNdkiphoEtSJQx0SaqEgS5JlTDQJakSBrokVcJAl6RK+D9FpSGJs/r573HaVeUZc/+/JxyhS1IlDHRJqoSBLkmVMNAlqRIGuiRVoverXNrxeOA
6YPfS/59p5Rm0Yx/gYmAUWA8cTyt/MGeVSpK2q58R+sPAS2jlocAK4Gja8fPAW4FraOWBwDVlXZI0T3qP0FuZwINlbUn5SOAYYGVpXwtcC/zRsAuUJPWnvzn0diyiHTcBm4CraeUNwH60cgNA+bxvt10jYnVEjEfE+MTExHCqliRto79Ab+UWWrkCeArwPNpxSL8nyMw1mTmWmWMjIz3/x6kkaZZm9iqXVv43zdTK0cBG2rEcoHzeNOTaJEkz0DvQ2zFCO/Yuy0uBXwS+BlwGrCq9VgGfmJsSJUn96OePcy0H1tKORTS/AC6hlVfQjuuBS2jH64HvAq+cwzolST308yqXm4HDurR/Hzhy+CVJkmbDd4pKUiUMdEmqhIEuSZUw0CWpEga6JFXCQJekShjoklQJA12SKmGgS1IlDHRJqoSBLkmVMNAlqRIGuiRVwkCXpEoY6JJUCQNdkiphoEtSJQx0SaqEgS5JlTDQJakSBrokVcJAl6RKGOiSVAkDXZIqsbhnj3Y8FbgQeDLwKLCGVr6XdpwJvAGYKD1Pp5VXzlGdkqQeegc6bAZ+j1beSDv2AtbRjqvLtvfQynPmrjxJUr96B3orNwAbyvIDtON2YP+5LUuSNFMzm0NvxyhwGHBDaTmFdtxMO86nHU/qtktErI6I8YgYn5iY6NZFkjQE/Qd6O/YEPgacSivvBz4EHACsoBnBv6vbbpm5JjPHMnNsZGRk8IolSV31M4cO7VhCE+YX0cqPA9DKjR3bzwWuGH55kqR+9R6htyOADwO308p3d7Qv7+h1HHDrsIuTJPWvnxH6EcBrgFtox02l7XTgBNqxAkhgPfDGOalQktSXfl7l8jkgumzxNeeStBPxnaKSVIn+nhTdGUS3BwkSkDnfFUg7BUfoklQJA12SKmGgS1IlDHRJqoSBLkmVMNAlqRIGuiRVwkCXpEoY6JJUCQNdkiphoEtSJQx0SaqEgS5JlTDQJakSBrokVcJAl6RKGOiSVAkDXZIqYaBLUiUMdEmqhIEuSZUw0CWpEot79mjHU4ELgScDjwJraOV7acc+wMXAKLAeOJ5W/mDOKpUkbVc/I/TNwO/RyoOAnwfeTDsOBt4KXEMrDwSuKeuSpHnSO9BbuYFW3liWHwBuB/YHjgHWll5rgWPnpkRJUj9mNofejlHgMOAGYD9auQGgfN632y4RsToixiNifGJiYpBaJUnb0X+gt2NP4GPAqbTy/n53y8w1mTmWmWMjIyOzKFGS1I/+Ar0dS2jC/CJa+fHSupF2LC/blwOb5qJASVJ/egd6OwL4MHA7rXx3x5bLgFVleRXwiaFXJ0nqW++XLcIRwGuAW2jHTaXtdOBs4BLa8Xrgu8Ar56ZESVI/egd6Kz8HxDRbjxxqNZKkWfOdopJUCQNdkiphoEtSJQx0SaqEgS5JlTDQJakSBrokVcJAl6RKGOiSVAkDXZIqYaBLUiUMdEmqhIEuSZUw0CWpEga6JFXCQJekShjoklQJA12SKmGgS1IlDHRJqoSBLkmVMNAlqRIGuiRVwkCXpEos7tmjHecDvwpsopWHlLYzgTcAE6XX6bTyyrkpUZLUj96BDhcA7wcunNL+Hlp5ztArkiTNSu8pl1ZeB9w396VIkgYxyBz6KbTjZtpxPu140nSdImJ1RIxHxPjExMR03SRJA5ptoH8IOABYAWwA3jVdx8xck5ljmTk2MjIyy9NJknrpZw59W63c+OPldpwLXDGkeiRJszS7EXo7lnesHQfcOpRqJEmz1s/LFj8CrASW0Y67gDOAlbRjBZDAeuCNc1eiJKkfvQO9lSd0af3w8EuRJA3Cd4pKUiUMdEmqhIEuSZUw0CWpEga6JFXCQJekShjoklQJA12SKmGgS1IlDHRJqoSBLkmVMNAlqRIGuiRVwkCXpEoY6JJUCQNdkiphoEtSJQx0SaqEgS5JlTDQJakSBrokVcJAl6RKGOiSVAkDXZIqsbhnj3acD/wqsIlWHlLa9gEuBka
B9cDxtPIHc1WkJKm3fkboFwBHT2l7K3ANrTwQuKasS5LmUe9Ab+V1wH1TWo8B1pbltcCxwy1LkjRTs51D349WbgAon/edrmNErI6I8YgYn5iYmOXpJEm9zPmTopm5JjPHMnNsZGRkrk8nSbus2Qb6RtqxHKB83jS0iiRJszLbQL8MWFWWVwGfGE45kqTZ6udlix8BVgLLaMddwBnA2cAltOP1wHeBV85hjZKkPvQO9FaeMM2WI4dbiiRpEL5TVJIqYaBLUiUMdEmqhIEuSZUw0CWpEga6JFXCQJekShjoklQJA12SKmGgS1IlDHRJqoSBLkmVMNAlqRIGuiRVwkCXpEoY6JJUCQNdkiphoEtSJQx0SaqEgS5JlTDQJakSBrokVcJAl6RKGOiSVInFA+3djvXAA8AWYDOtHBtCTZKkWRgs0BsvppX3DuE4kqQBOOUiSZUYNNATuIp2rKMdq7t1iIjVETEeEeMTExMDnk6SNJ1BA/0IWnk48FLgzbTjhVM7ZOaazBzLzLGRkZEBTydJms5ggd7Ku8vnTcClwPMGL0mSNBuzD/R27EE79vrxMhwF3DqcsiRJMzXIq1z2Ay6lHZPHadPKTw2lKknSjM0+0Fv5LeDQ4ZUiSRqEL1uUpEoY6JJUCQNdkiphoEtSJQx0SaqEgS5JlTDQJakSBrokVcJAl6RKGOiSVAkDXZIqYaBLUiUMdEmqhIEuSZUw0CWpEga6JFXCQJekShjoklQJA12SKmGgS1IlDHRJqoSBLkmVMNAlqRIGuiRVYvFAe7fjaOC9wCLgPFp59jCKkiTN3OxH6O1YBHwAeClwMHAC7Th4SHVJkmZokCmX5wF30Mpv0cofAR8FjhlOWZKkmRpkymV/4M6O9buAn5vaKSJWA6vL6oMR8fUBzqmtlgH3zncRO4WI+a5A3XmPdogzB7pPn9ZPp0ECvVt1uU1D5hpgzQDnURcRMZ6ZY/NdhzQd79Edb5Apl7uAp3asPwW4e7ByJEmzNcgI/cvAgbTj6cD3gFcDraFUJUmasdmP0Fu5GTgF+DRwO3AJrbxtSHWpN6extLPzHt3BInObaW9J0gLkO0UlqRIGuiRVYpcP9IAtATd1fIzOd029BOwe8JlS76vmux7N3EK87+ZKwEjADQFfCfiF+a5nIRvsb7nU4aGEFfNdxAwdBixZgHVrq4V4382VI4GvJaya70IWul1+hN5NwKKAdwZ8OeDmgDeW9pUBnw24JOAbAWcHnBjwpYBbAg4o/S4I+FDAvwd8K+BFAecH3B5wQcd5jgq4PuDGgH8K2LO0nx3w1XLuc6bUti/wj8CKMrI7IGB9NO/KI2As4NqyfGbA2oCrSp9fD3hHqfVTAUtKv+eW61oX8OmA5aX92oCxsrwsYH1Zfla55ptKjQeW9pM62v+ufB0Xla/HreW8b5mzb9wCtzPfd2X7meV415bj/07HttPK9/jWgFNL22g597kBt5X7cOmUY64A3gG8rNw3SwMe7Nj+isnah3B90/2cvCi2PlL6SsBepf0POr4XZ5W2PQI+GfCf5Vp3rkfImblLf5C5hcybyselpW01mW8ry7uTOU7m08lcSeZ/k7m8tH+PzLNKv98l86/L8gVkfpTMIPMYMu8n89lk7kbmOjJXkLmMzOvI3KPs80dk/hmZ+5D5dTKjtO/dpeaVZF7Rsb6ezGVleYzMa8vymWR+jswlZB5K5g/JfGnZdimZx5ZtXyBzpLS/iszzy/K1ZI6V5WVkri/Lf0PmiWX5cWQuJfMgMi8nc0lp/yCZryXzuWRe3VHrNtezK36wMO+7M8u9sns5zvdp7p/nknkLmXuQuSeZt5F5GJmjZG4mc0XZ/xIyT+py3N8g8/0d6w92LL+CzAsGvb4ePyeXk3lEWd6TzMVkHkXmmnKu3ci8gswXkvlyMs/tqO+J830vdX445dL9oe9RwHMCXlHWn0gzCv0R8OWEDQAB3wSuKn1uAV7ccYzLEzKa9o3ZfCbgNpr50qf
Q/JXKz0fT/3HA9cD9wP8B5wV8ErhiwOv714RHSh2LgE911DsK/CxwCHB1qWMR5fq243rgT6K5ho8n/Fc0D5ufSzOigWYktgm4HHhGwN/QXM9VXY+461mo990nEx4GHo7m+7sf8ALg0oT/Lef6OM1c+GXAtxNuKvuuY/DnCmZ7fdvzeeDdARfR3M93RfO9OAr4SumzJ8334j+AcwL+Crgim/WdhoHeXQC/nc2bpjobV9LczJMe7Vh/lMd+PR/u0qez3xbg6oQTupz8eTQB+WqaN2+9pEe9m9k6ffb4KdseBkh4NOCR3Pr3dibrCOC2hOf3e9yEdsANwK/QTNGcXI6zNuGPu1zPocAvA28Gjgde1+N6dlUL4b7rPOYWtt5D05naf+l0HTt0vjmm6/3MLK6P6e/ns8svsZcBXwz4RZpr+suEv5t6kGgGLi8D/jLgqoS393FNO4Rz6N19GnhTbJ1j/pmAPYZ8ji8CRwT8dDnHT5Tz7Ak8MeFKmrnIfp44W09zkwG8fIZ1fJ3mVQbPL3UsCXhWl+NOjhoJeAbwrYT30YzCngNcQzPfuW/ps0/A08qc5W4JHwP+FDh8hvXtShbafTfpOuDYcqw9gOMYbOS6MeCgaPLpuBnu2/X6yrb1dPk5CTgg4ZZsRt3jwDNpvhev65h/3z9g34CfBH6YzfNY57CT3c+O0Ls7j+bh243R/KaeAI4d5gkSJgJ+A/hIwO6l+W3AA8AnohlBBP09iXgW8OGA02lGzjOp40flIf77onmIvxj4a5qHsOfQPBH3GuDfOnZ7Fc0ToI8A9wBvT7gvmvqvKj+Ij9CMyB8C/j62Dh62GcHrxxbafTd5zBujeVLyS6XpvGyeXBydZZlvpZnyuRO4lRKqfdYy3fV9g+l/Tk6NZtpqC/BVmmnKhwMOonlyFZonak+i+UXxzmgeETwCvGmW1zgnfOu/JFXCKRdJqoSBLkmVMNAlqRIGuiRVwkCXpEoY6JJUCQNdkirx/yqHxfo7WWH9AAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "dark" + }, + "output_type": "display_data" + } + ], "source": [ "# Affichage sous la forme d'un graphe en barres dont la hauteur reflete des taux pour chacun des groupes. \n", "#\n", @@ -379,18 +726,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "On reprend l'analyse en opérant dans la 1ere tranche d'age [AgeMin:34] ans :\n" + ] + } + ], "source": [ "print (\"On reprend l'analyse en opérant dans la 1ere tranche d'age [AgeMin:34] ans :\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nb de femmes dans la tranche d'age [ 18 :34] = 400\n" + ] + } + ], "source": [ "femmes_TrA1 = OriginalInputData[OriginalInputData[\"Age\"].between(int(AgeMin), 34)]\n", "Nb_femmes_TrA1 = femmes_TrA1.shape[0]\n", @@ -399,9 +762,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dans la tranche d'age [ 18 :34] : Nb Fumeuses= 181 vs Nb NonFumeuses= 219\n" + ] + } + ], "source": [ "Fumeuses_TrA1 = femmes_TrA1.loc[femmes_TrA1['Smoker'] == 'Yes']\n", "Nb_Fumeuses_TrA1 = Fumeuses_TrA1.shape[0]\n", @@ -411,9 +782,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nb Fumeuses Mortes= 5 vs Nb Fumeuses Vivantes= 176\n" + ] + } + ], "source": [ "FumeusesVivantes_TrA1 = femmes_TrA1.loc[(femmes_TrA1['Smoker'] == 'Yes') & (femmes_TrA1['Status'] == 'Alive')]\n", "Nb_FumeusesVivantes_TrA1 = FumeusesVivantes_TrA1.shape[0]\n", @@ -423,9 +802,17 @@ }, { "cell_type": "code", - 
"execution_count": null, + "execution_count": 22, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nb Non Fumeuses Mortes= 6 vs Nb Non Fumeuses Vivantes= 213\n" + ] + } + ], "source": [ "NonFumeusesVivantes_TrA1 = femmes_TrA1.loc[(femmes_TrA1['Smoker'] == 'No') & (femmes_TrA1['Status'] == 'Alive')]\n", "Nb_NonFumeusesVivantes_TrA1 = NonFumeusesVivantes_TrA1.shape[0]\n", @@ -435,9 +822,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dans la tranche d'age [18:34] : le Taux de mortalite des fumeuses = 2.76 \n" + ] + } + ], "source": [ "# Le taux de mortalité dans le groupe des femmes fumeuses dans la la tranche d'age [18:34] vaut :\n", "Fumeuses_TrA1_TxMortalite = (Nb_FumeusesMortes_TrA1)/Nb_Fumeuses_TrA1 * 100\n", @@ -446,9 +841,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dans la tranche d'age [18:34] : le Taux de mortalite des Non fumeuses = 2.74 \n" + ] + } + ], "source": [ "# Le taux de mortalité dans le groupe des femmes Non fumeuses dans la la tranche d'age [18:34] vaut :\n", "NonFumeuses_TrA1_TxMortalite = (Nb_NonFumeusesMortes_TrA1)/Nb_NonFumeuses_TrA1 * 100\n", @@ -457,9 +860,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Dans la tranche d'age [18:34] Nb_Vivantes Nb_Mortes Mortalité en %\n", + "0 Femmes fumeuses 176 5 2.762431\n", + "1 Femmes non fumeuses 213 6 2.739726\n" + ] + } + ], "source": [ "table1 = {\"Dans la tranche d'age [18:34]\": ['Femmes fumeuses', 'Femmes non fumeuses'],\n", " 'Nb_Vivantes': [Nb_FumeusesVivantes_TrA1, 
Nb_NonFumeusesVivantes_TrA1],\n", @@ -472,18 +885,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "On reprend l'analyse en opérant dans la 2nde tranche d'age ]34:54] ans :\n" + ] + } + ], "source": [ "print (\"On reprend l'analyse en opérant dans la 2nde tranche d'age ]34:54] ans :\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nb de femmes dans la tranche d'age ]34:54] = 436\n" + ] + } + ], "source": [ "femmes_TrA2 = OriginalInputData[OriginalInputData[\"Age\"].between(float(34)+0.000001, 54)]\n", "Nb_femmes_TrA2 = femmes_TrA2.shape[0]\n", @@ -492,9 +921,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dans la tranche d'age ]34:54] : Nb Fumeuses= 237 vs Nb NonFumeuses= 199\n" + ] + } + ], "source": [ "Fumeuses_TrA2 = femmes_TrA2.loc[femmes_TrA2['Smoker'] == 'Yes']\n", "Nb_Fumeuses_TrA2 = Fumeuses_TrA2.shape[0]\n", @@ -504,9 +941,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nb Fumeuses Mortes= 41 vs Nb Fumeuses Vivantes= 196\n" + ] + } + ], "source": [ "FumeusesVivantes_TrA2 = femmes_TrA2.loc[(femmes_TrA2['Smoker'] == 'Yes') & (femmes_TrA2['Status'] == 'Alive')]\n", "Nb_FumeusesVivantes_TrA2 = FumeusesVivantes_TrA2.shape[0]\n", @@ -516,21 +961,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": {}, - "outputs": [], - "source": [ - "NonFumeusesVivantes_TrA2 = femmes_TrA2.loc[(femmes_TrA2['Smoker'] == 'No') & (femmes_TrA2['Status'] 
== 'Alive')]\n", - "Nb_NonFumeusesVivantes_TrA2 = NonFumeusesVivantes_TrA2.shape[0]\n", - "Nb_NonFumeusesMortes_TrA2 = Nb_NonFumeuses_TrA2 - Nb_NonFumeusesVivantes_TrA2\n", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nb Non Fumeuses Mortes= 19 vs Nb Non Fumeuses Vivantes= 180\n" + ] + } + ], + "source": [ + "NonFumeusesVivantes_TrA2 = femmes_TrA2.loc[(femmes_TrA2['Smoker'] == 'No') & (femmes_TrA2['Status'] == 'Alive')]\n", + "Nb_NonFumeusesVivantes_TrA2 = NonFumeusesVivantes_TrA2.shape[0]\n", + "Nb_NonFumeusesMortes_TrA2 = Nb_NonFumeuses_TrA2 - Nb_NonFumeusesVivantes_TrA2\n", "print (\"Nb Non Fumeuses Mortes=\",Nb_NonFumeusesMortes_TrA2,\" vs Nb Non Fumeuses Vivantes=\",Nb_NonFumeusesVivantes_TrA2)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dans la tranche d'age ]34:54] : le Taux de mortalite des fumeuses = 17.30 \n" + ] + } + ], "source": [ "# Le taux de mortalité dans le groupe des femmes fumeuses dans la la tranche d'age ]34:54] vaut :\n", "Fumeuses_TrA2_TxMortalite = (Nb_FumeusesMortes_TrA2)/Nb_Fumeuses_TrA2 * 100\n", @@ -539,9 +1000,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dans la tranche d'age ]34:54] : le Taux de mortalite des Non fumeuses = 9.55 \n" + ] + } + ], "source": [ "# Le taux de mortalité dans le groupe des femmes Non fumeuses dans la la tranche d'age ]34:54] vaut :\n", "NonFumeuses_TrA2_TxMortalite = (Nb_NonFumeusesMortes_TrA2)/Nb_NonFumeuses_TrA2 * 100\n", @@ -550,9 +1019,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Dans la tranche d'age ]34:54] 
Nb_Vivantes Nb_Mortes Mortalité en %\n", + "0 Femmes fumeuses 196 41 17.299578\n", + "1 Femmes non fumeuses 180 19 9.547739\n" + ] + } + ], "source": [ "table2 = {\"Dans la tranche d'age ]34:54]\": ['Femmes fumeuses', 'Femmes non fumeuses'],\n", " 'Nb_Vivantes': [Nb_FumeusesVivantes_TrA2, Nb_NonFumeusesVivantes_TrA2],\n", @@ -565,18 +1044,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 62, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "On reprend l'analyse en opérant dans la 3eme tranche d'age ]54:64] ans :\n" + ] + } + ], "source": [ "print (\"On reprend l'analyse en opérant dans la 3eme tranche d'age ]54:64] ans :\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 63, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nb de femmes dans la tranche d'age ]54:64] = 236\n" + ] + } + ], "source": [ "femmes_TrA3 = OriginalInputData[OriginalInputData[\"Age\"].between(float(54)+0.000001, 64)]\n", "Nb_femmes_TrA3 = femmes_TrA3.shape[0]\n", @@ -585,9 +1080,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 64, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dans la tranche d'age ]54:64] : Nb Fumeuses= 115 vs Nb NonFumeuses= 121\n" + ] + } + ], "source": [ "Fumeuses_TrA3 = femmes_TrA3.loc[femmes_TrA3['Smoker'] == 'Yes']\n", "Nb_Fumeuses_TrA3 = Fumeuses_TrA3.shape[0]\n", @@ -597,9 +1100,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 65, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nb Fumeuses Mortes= 51 vs Nb Fumeuses Vivantes= 64\n" + ] + } + ], "source": [ "FumeusesVivantes_TrA3 = femmes_TrA3.loc[(femmes_TrA3['Smoker'] == 'Yes') & (femmes_TrA3['Status'] == 'Alive')]\n", 
"Nb_FumeusesVivantes_TrA3 = FumeusesVivantes_TrA3.shape[0]\n", @@ -609,9 +1120,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 66, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nb Non Fumeuses Mortes= 40 vs Nb Non Fumeuses Vivantes= 81\n" + ] + } + ], "source": [ "NonFumeusesVivantes_TrA3 = femmes_TrA3.loc[(femmes_TrA3['Smoker'] == 'No') & (femmes_TrA3['Status'] == 'Alive')]\n", "Nb_NonFumeusesVivantes_TrA3 = NonFumeusesVivantes_TrA3.shape[0]\n", @@ -621,9 +1140,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 67, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dans la tranche d'age ]54:64] : le Taux de mortalite des fumeuses = 44.35 \n" + ] + } + ], "source": [ "# Le taux de mortalité dans le groupe des femmes fumeuses dans la tranche d'age ]54:64] vaut :\n", "Fumeuses_TrA3_TxMortalite = (Nb_FumeusesMortes_TrA3)/Nb_Fumeuses_TrA3 * 100\n", @@ -632,9 +1159,38 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 68, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dans la tranche d'age ]54:64] : le Taux de mortalite des Non fumeuses = 33.06 \n" + ] + } + ], + "source": [ + "# Le taux de mortalité dans le groupe des femmes Non fumeuses dans la la tranche d'age ]54:64] vaut :\n", + "NonFumeuses_TrA3_TxMortalite = (Nb_NonFumeusesMortes_TrA3)/Nb_NonFumeuses_TrA3 * 100\n", + "print (\"Dans la tranche d'age ]54:64] : le Taux de mortalite des Non fumeuses = % 4.2f \" % NonFumeuses_TrA3_TxMortalite)" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Dans la tranche d'age ]54:64] Nb_Vivantes Nb_Mortes Mortalité en %\n", + "0 Femmes fumeuses 64 51 44.347826\n", + "1 Femmes non fumeuses 
81 40 33.057851\n" + ] + } + ], "source": [ "table3 = {\"Dans la tranche d'age ]54:64]\": ['Femmes fumeuses', 'Femmes non fumeuses'],\n", " 'Nb_Vivantes': [Nb_FumeusesVivantes_TrA3, Nb_NonFumeusesVivantes_TrA3],\n", @@ -647,18 +1203,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 70, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "On reprend l'analyse en opérant dans la 4eme tranche d'age ]64:AgeMax] ans :\n" + ] + } + ], "source": [ "print (\"On reprend l'analyse en opérant dans la 4eme tranche d'age ]64:AgeMax] ans :\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 71, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nb de femmes dans la tranche d'age ]64: 89 ] = 242\n" + ] + } + ], "source": [ "femmes_TrA4 = OriginalInputData[OriginalInputData[\"Age\"].between(float(64)+0.000001, int(AgeMax)+1)]\n", "Nb_femmes_TrA4 = femmes_TrA4.shape[0]\n", @@ -667,9 +1239,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 72, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dans la tranche d'age ]64 89 ] : Nb Fumeuses= 49 vs Nb NonFumeuses= 193\n" + ] + } + ], "source": [ "Fumeuses_TrA4 = femmes_TrA4.loc[femmes_TrA4['Smoker'] == 'Yes']\n", "Nb_Fumeuses_TrA4 = Fumeuses_TrA4.shape[0]\n", @@ -679,9 +1259,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 73, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nb Fumeuses Mortes= 42 vs Nb Fumeuses Vivantes= 7\n" + ] + } + ], "source": [ "FumeusesVivantes_TrA4 = femmes_TrA4.loc[(femmes_TrA4['Smoker'] == 'Yes') & (femmes_TrA4['Status'] == 'Alive')]\n", "Nb_FumeusesVivantes_TrA4 = FumeusesVivantes_TrA4.shape[0]\n", @@ -691,9 +1279,17 @@ }, { "cell_type": "code", 
- "execution_count": null, + "execution_count": 74, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nb Non Fumeuses Mortes= 165 vs Nb Non Fumeuses Vivantes= 28\n" + ] + } + ], "source": [ "NonFumeusesVivantes_TrA4 = femmes_TrA4.loc[(femmes_TrA4['Smoker'] == 'No') & (femmes_TrA4['Status'] == 'Alive')]\n", "Nb_NonFumeusesVivantes_TrA4 = NonFumeusesVivantes_TrA4.shape[0]\n", @@ -703,9 +1299,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 75, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dans la tranche d'age ]64 89 ] : le Taux de mortalite des fumeuses = 85.71 \n" + ] + } + ], "source": [ "# Le taux de mortalité dans le groupe des femmes fumeuses dans la tranche d'age ]64:AgeMax] vaut :\n", "Fumeuses_TrA4_TxMortalite = (Nb_FumeusesMortes_TrA4)/Nb_Fumeuses_TrA4 * 100\n", @@ -714,9 +1318,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 76, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dans la tranche d'age ]64 89 ] : le Taux de mortalite des Non fumeuses = 85.49 \n" + ] + } + ], "source": [ "# Le taux de mortalité dans le groupe des femmes Non fumeuses dans la tranche d'age ]64:AgeMax] vaut :\n", "NonFumeuses_TrA4_TxMortalite = (Nb_NonFumeusesMortes_TrA4)/Nb_NonFumeuses_TrA4 * 100\n", @@ -725,9 +1337,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 77, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Dans la tranche d'age ]64:89] Nb_Vivantes Nb_Mortes Mortalité en %\n", + "0 Femmes fumeuses 7 42 85.714286\n", + "1 Femmes non fumeuses 28 165 85.492228\n" + ] + } + ], "source": [ "table4 = {\"Dans la tranche d'age ]64:89]\": ['Femmes fumeuses', 'Femmes non fumeuses'],\n", " 'Nb_Vivantes': [Nb_FumeusesVivantes_TrA4, 
Nb_NonFumeusesVivantes_TrA4],\n", @@ -747,9 +1369,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 78, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Dans la tranche d'age [18:34] Nb_Vivantes Nb_Mortes Mortalité en %\n", + "0 Femmes fumeuses 176 5 2.762431\n", + "1 Femmes non fumeuses 213 6 2.739726\n", + " Dans la tranche d'age ]34:54] Nb_Vivantes Nb_Mortes Mortalité en %\n", + "0 Femmes fumeuses 196 41 17.299578\n", + "1 Femmes non fumeuses 180 19 9.547739\n", + " Dans la tranche d'age ]54:64] Nb_Vivantes Nb_Mortes Mortalité en %\n", + "0 Femmes fumeuses 64 51 44.347826\n", + "1 Femmes non fumeuses 81 40 33.057851\n", + " Dans la tranche d'age ]64:89] Nb_Vivantes Nb_Mortes Mortalité en %\n", + "0 Femmes fumeuses 7 42 85.714286\n", + "1 Femmes non fumeuses 28 165 85.492228\n" + ] + } + ], "source": [ "print (Resume1)\n", "print (Resume2)\n", @@ -759,9 +1400,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 79, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "# Construire la série des valeurs pour la hauteur des barres :\n", "HauteursDesBarres = [Fumeuses_TrA1_TxMortalite,NonFumeuses_TrA1_TxMortalite,\n", @@ -822,23 +1474,133 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 80, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LogisticRegression\n", "#\n", - "import statsmodels.discrete.discrete_model as sm" + "import statsmodels.api as sm" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 136, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SmokerStatusAge
0YesAlive21.0
1YesAlive19.3
2NoDead57.5
3NoAlive47.1
4YesAlive81.4
5NoAlive36.8
6NoAlive23.8
7YesDead57.5
8YesAlive24.8
9YesAlive49.5
\n", + "
" + ], + "text/plain": [ + " Smoker Status Age\n", + "0 Yes Alive 21.0\n", + "1 Yes Alive 19.3\n", + "2 No Dead 57.5\n", + "3 No Alive 47.1\n", + "4 Yes Alive 81.4\n", + "5 No Alive 36.8\n", + "6 No Alive 23.8\n", + "7 Yes Dead 57.5\n", + "8 Yes Alive 24.8\n", + "9 Yes Alive 49.5" + ] + }, + "execution_count": 136, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Rappel : OriginalInputData = pd.read_csv(\"../__DataSets/module3_Practical_session_Subject6_smoking.csv\")\n", - "OriginalInputData # .head(10) " + "OriginalInputData.head(10) " ] }, { @@ -850,130 +1612,436 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# a) selection des colonnes \"Smoker\" et \"Age\" (qui seront ''les variables explicatives'' de la mortalite) :\n", - "# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - "X = OriginalInputData[['Smoker','Age']]\n", - "X.head(5) # pour vérifier le contenu " - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 82, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "verification : finalement countF= 582 ? 582\n", + "verification : finalement countNF= 732 ? 732\n", + "NbDecesChezFumeuses= 139\n", + "NbDecesChezNonFumeuses= 230\n" + ] + } + ], "source": [ - "# Pour convertir les valeurs de la colonne 'Smoker' qui valent 'Yes' ou 'No' en entiers 1 ou 0 \n", - "# ( NB pour la suite, stockage dans des DataFrames separes pour les Fumeuses et pour les NonFumeuses )\n", - "# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + "# Pour la suite, stockage dans des DataFrames separes pour les Fumeuses et pour les NonFumeuses. \n", + "# Creation de Y_pourEtudeMortaliteFumeuses et Y_pourEtudeMortaliteNonFumeuses en comptant >0 (resp. 
<0) si 'Dead' (si 'Alive') \n", + "# convertion des valeurs de la colonne 'Smoker' qui valent 'Yes' ou 'No' en entiers 1 ou 0 \n", + "# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + "X = pd.DataFrame()\n", + "Y = pd.DataFrame()\n", "X_pourEtudeMortaliteFumeuses = pd.DataFrame() \n", "X_pourEtudeMortaliteNonFumeuses = pd.DataFrame()\n", "Y_pourEtudeMortaliteFumeuses = pd.DataFrame() \n", "Y_pourEtudeMortaliteNonFumeuses = pd.DataFrame()\n", "countF = 0\n", "countNF = 0\n", + "sommeDesYi = 0\n", + "NbDecesChezFumeuses = 0\n", + "NbDecesChezNonFumeuses = 0\n", "taille1Colonne = OriginalInputData['Smoker'].shape[0] # devrait etre egal à 1314\n", "for i in range(taille1Colonne) :\n", - " if OriginalInputData.loc[i,'Smoker'] == \"Yes\" : \n", + " if OriginalInputData.loc[i,'Smoker'] == 'Yes' : \n", " X.loc[i,'Smoker'] = 1 # conversion en entier\n", " X_pourEtudeMortaliteFumeuses.loc[countF,'Smoker'] = 1 \n", - " X_pourEtudeMortaliteFumeuses.loc[countF,'Age'] = OriginalInputData.loc[i,'Age'] \n", - " # Y_pourEtudeMortaliteFumeuses.loc[countF,'Status'] = OriginalInputData.loc[i,'Status'] :\n", - " if OriginalInputData.loc[i,'Status'] == \"Alive\" :\n", - " Y_pourEtudeMortaliteFumeuses.loc[countF,'Status'] = 1\n", + " X_pourEtudeMortaliteFumeuses.loc[countF,'Age'] = OriginalInputData.loc[i,'Age'] / AgeMax\n", + " if OriginalInputData.loc[i,'Status'] == 'Dead' : \n", + " Y_pourEtudeMortaliteFumeuses.loc[countF,'Status'] = 1 # la caracteristque Death est comptée POSITIVEMENT\n", + " NbDecesChezFumeuses += 1\n", " else : \n", - " Y_pourEtudeMortaliteFumeuses.loc[countF,'Status'] = 0\n", + " Y_pourEtudeMortaliteFumeuses.loc[countF,'Status'] = 0 # comptée nulle plutot que NEGATIVEMENT\n", " countF += 1\n", " else : \n", " X.loc[i,'Smoker'] = 0 # conversion en entier\n", " X_pourEtudeMortaliteNonFumeuses.loc[countNF,'Smoker'] = 0\n", - " X_pourEtudeMortaliteNonFumeuses.loc[countNF,'Age'] = 
OriginalInputData.loc[i,'Age']\n", - " # Y_pourEtudeMortaliteNonFumeuses.loc[countNF,'Status'] = OriginalInputData.loc[i,'Status'] :\n", - " if OriginalInputData.loc[i,'Status'] == \"Alive\" :\n", - " Y_pourEtudeMortaliteNonFumeuses.loc[countNF,'Status'] = 1 \n", + " X_pourEtudeMortaliteNonFumeuses.loc[countNF,'Age'] = OriginalInputData.loc[i,'Age'] / AgeMax\n", + " if OriginalInputData.loc[i,'Status'] == 'Dead' :\n", + " Y_pourEtudeMortaliteNonFumeuses.loc[countNF,'Status'] = 1\n", + " NbDecesChezNonFumeuses += 1\n", " else : \n", " Y_pourEtudeMortaliteNonFumeuses.loc[countNF,'Status'] = 0 \n", " countNF += 1\n", - "#print (\"verification : finalement countF=\",X_pourEtudeMortaliteFumeuses.shape[0],\" ? \",Nb_Fumeuses)\n", - "#print (\"verification : finalement countNF=\",X_pourEtudeMortaliteNonFumeuses.shape[0],\" ? \",Nb_NonFumeuses)\n", - "X_pourEtudeMortaliteFumeuses.describe()" + " X.loc[i,'Age'] = OriginalInputData.loc[i,'Age'] / AgeMax # si on veut avoir les ages dans l'intervalle [0:1]\n", + " if OriginalInputData.loc[i,'Status'] == 'Dead' :\n", + " Y.loc[i,'Status'] = 1\n", + " else :\n", + " Y.loc[i,'Status'] = 0\n", + " sommeDesYi += Y.loc[i,'Status'] \n", + "print (\"verification : finalement countF=\",X_pourEtudeMortaliteFumeuses.shape[0],\" ? \",Nb_Fumeuses)\n", + "print (\"verification : finalement countNF=\",X_pourEtudeMortaliteNonFumeuses.shape[0],\" ? \",Nb_NonFumeuses)\n", + "print (\"NbDecesChezFumeuses=\",NbDecesChezFumeuses)\n", + "print (\"NbDecesChezNonFumeuses=\",NbDecesChezNonFumeuses)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 83, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SmokerAge
count582.0582.000000
mean1.00.492433
std0.00.180399
min1.00.200222
25%1.00.348165
50%1.00.479422
75%1.00.624861
max1.00.992214
\n", + "
" + ], + "text/plain": [ + " Smoker Age\n", + "count 582.0 582.000000\n", + "mean 1.0 0.492433\n", + "std 0.0 0.180399\n", + "min 1.0 0.200222\n", + "25% 1.0 0.348165\n", + "50% 1.0 0.479422\n", + "75% 1.0 0.624861\n", + "max 1.0 0.992214" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "X_pourEtudeMortaliteNonFumeuses.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print (\"verif : somme des valeurs dans la colonne 'Smoker' = \",X['Smoker'].sum())" + "X_pourEtudeMortaliteFumeuses.describe()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 84, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SmokerAge
count732.0732.000000
mean0.00.554125
std0.00.232462
min0.00.200222
25%0.00.348999
50%0.00.538376
75%0.00.732481
max0.01.000000
\n", + "
" + ], + "text/plain": [ + " Smoker Age\n", + "count 732.0 732.000000\n", + "mean 0.0 0.554125\n", + "std 0.0 0.232462\n", + "min 0.0 0.200222\n", + "25% 0.0 0.348999\n", + "50% 0.0 0.538376\n", + "75% 0.0 0.732481\n", + "max 0.0 1.000000" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# b) selection de la colonne \"Status\" (vivante ou morte) qui sera ''la variable mortalité à prédire'' :\n", - "Y = OriginalInputData['Status'] # .astype(int) en principe MAIS ValueError: invalid literal for int() with base 10: 'Yes'!" + "X_pourEtudeMortaliteNonFumeuses.describe()" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "raw", "metadata": {}, - "outputs": [], "source": [ - "# Vérifions le contenu de la sous-liste de données qui contient 2 données binaires 'Dead' ou 'Alive' :\n", - "Y.head(5) # pour vérifier le contenu\n", - "# Y.describe()" + "Remarque : la comparaison des estimations de l'age moyen des 2 groupes est une 1ere indication" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 85, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "verif : somme des valeurs dans la colonne 'Smoker' = 582.0\n" + ] + } + ], "source": [ - "# Pour convertir les valeurs de la colonne 'Status' qui valent 'Alive' ou 'Dead' en entiers 1 ou 0 :\n", - "# { remarque : Y = pd.to_numeric(OriginalInputData['Status'],errors='coerce') n'a pas marche }\n", - "# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - "taille1Colonne = OriginalInputData['Smoker'].shape[0] # devrait etre egal à 1314\n", - "for i in range(taille1Colonne) :\n", - " if OriginalInputData.loc[i,'Status'] == \"Alive\" : \n", - " Y.loc[i] = 1\n", - " else :\n", - " Y.loc[i] = 0" + "print (\"verif : somme des valeurs dans la colonne 'Smoker' = \",X['Smoker'].sum())" ] }, { "cell_type": "code", - 
"execution_count": null, + "execution_count": 86, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SmokerAge
01.00.233593
11.00.214683
20.00.639600
30.00.523915
41.00.905451
\n", + "
" + ], + "text/plain": [ + " Smoker Age\n", + "0 1.0 0.233593\n", + "1 1.0 0.214683\n", + "2 0.0 0.639600\n", + "3 0.0 0.523915\n", + "4 1.0 0.905451" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "Y.head(5) # pour vérifier son contenu converti" + "# Vérifions le contenu de la sous-liste de données qui contient 2 données binaires 'Dead' ou 'Alive' :\n", + "X.head(5) # Y.describe()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 87, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Status
00.0
10.0
21.0
30.0
40.0
\n", + "
" + ], + "text/plain": [ + " Status\n", + "0 0.0\n", + "1 0.0\n", + "2 1.0\n", + "3 0.0\n", + "4 0.0" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "print (\"verif : somme des valeurs dans la colonne Status = \",Y.sum())" + "# Vérifions le contenu de la sous-liste de données qui contient 2 données binaires 'Dead' ou 'Alive' :\n", + "Y.head(5) # Y.describe()" ] }, { @@ -982,10 +2050,7 @@ "source": [ "Avertissement : \n", "* SciKit-Learn décide par défaut d’appliquer une régularisation sur le modèle. \n", - "* Dans le modèle que l'on va utiliser, on n'applique pas de pénalité et on prend un solver du type Newton qui est le plus classique pour la régression logistique.\n", - "* Pour comprendre les coefficients du modèle, SciKit-Learn stocke les informations dans .coef_, \n", - " il faut les afficher de manière plus agréable dans un DataFrame avec la constante du modèle ; \n", - " mais leur interprétation n'est pas évidente" + "* Dans le modèle que l'on va utiliser, on applique une pénalité de type 'l2' et on prend un solver du type Newton qui est le plus classique pour la régression logistique." ] }, { @@ -997,41 +2062,105 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 101, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "verif : somme des valeurs dans la colonne Y = 369.0\n" + ] + } + ], + "source": [ + "print (\"verif : somme des valeurs dans la colonne Y = \",sommeDesYi)" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.6/site-packages/sklearn/utils/validation.py:578: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples, ), for example using ravel().\n", + " y = column_or_1d(y, warn=True)\n" + ] + } + ], "source": [ "# Rappel : la variable 'Smoker' represente ici A LA FOIS les fumeuses et non-fumeuses !!\n", - "if Y.sum() == 0 : # on verifie que Y contient plus de 1 classe\n", - " print (\"Probleme : la somme Y.sum() = \",Y.sum(),\" devrait être différente de 0 !\")\n", + "if abs(sommeDesYi) == 0 : # on verifie que Y contient plus de 1 classe\n", + " print (\"Probleme : la somme sommeDesYi = \",sommeDesYi,\" devrait être différente de 0 !\")\n", " print (\" les valeurs de Y ne composent qu'une seule classe !\")\n", "else : # La regression logistique peut être effectuée :\n", - " SKL_MRL_A = LogisticRegression(penalty='none',solver='newton-cg')\n", + " SKL_MRL_A = LogisticRegression(penalty='l2',solver='newton-cg')\n", " SKL_MRL_A.fit(X,Y)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 103, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0.14128009, 7.3116638 ]])" + ] + }, + "execution_count": 103, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Afficher la valeur des coefficients pour ce modele : \n", + "SKL_MRL_A.coef_" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "Le 1er coefficient mesure la variation du logit (logarithme du rapport de cotes) de la probabilité de décès associée au tabagisme (Smoker = 1 contre 0), à âge fixé.\n", + "Le 2nd coefficient mesure la variation de ce même logit lorsque l'âge normalisé (Age/AgeMax) augmente de 1, à tabagisme fixé ; les deux coefficients ne sont donc pas directement comparables entre eux." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 104, "metadata": {}, "outputs": [], "source": [ "# Si souhaité, afficher dans un DataFrame les coefficients du modèle , avec la constante :\n", - "if False :\n", - " pd.DataFrame(np.concatenate([SKL_MRL_A.intercept_.reshape(-1,1),\n", - " SKL_MRL_A.coef_],axis=1),\n", - " index = [\"coef\"],\n", - " columns = [\"constante\"]+list(X.columns)).T" + 
"#pd.DataFrame(np.concatenate([SKL_MRL_A.intercept_.reshape(-1,1),\n", + "# SKL_MRL_A.coef_],axis=1),\n", + "# index = [\"coef\"],\n", + "# columns = [\"constante\"]+list(X.columns)).T" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 105, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Regression Logistique de SciKitLearn sur l'ensemble des 2 groupes fumeuses + non-fumeuses : score= 0.851\n" + ] + } + ], "source": [ "score__SKL_MRL_A = SKL_MRL_A.score(X,Y)\n", - "print (\"Regression Logistique de SciKitLearn sur l'ensemble des 2 groupes fumeuses + non-fumeuses : score=\",score__SKL_MRL_A)" + "print (\"Regression Logistique de SciKitLearn sur l'ensemble des 2 groupes fumeuses + non-fumeuses : score= %5.3f\" %score__SKL_MRL_A)" ] }, { @@ -1045,25 +2174,103 @@ "cell_type": "raw", "metadata": {}, "source": [ - "# Attention : par defaut, le modele de regression logistique de 'statsmodels' n'inclue pas d'interception avec une valeur cte ;\n", - "# pour inclure cette option d'interception dans le modele, \n", - "# utiliser l'instruction 'statsmodels.tools.add_constant' pour ajouter la constant dans la matrice X" + "Attention : \n", + "par defaut, le modele de regression logistique de 'statsmodels' n'inclut pas d'interception avec une valeur cte ;\n", + "pour inclure cette option d'interception dans le modele, \n", + "utiliser l'instruction 'statsmodels.tools.add_constant' pour ajouter la constante dans la matrice X\n", + "( Remarque : pour comprendre l'utilité et la mise en oeuvre de cette notion, voir le lien suivant :\n", + "https://stats.stackexchange.com/questions/440242/statsmodels-logistic-regression-adding-intercept )" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 106, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Optimization terminated successfully.\n", + " Current function 
value: 0.629624\n", + " Iterations 5\n" + ] + } + ], "source": [ "MLRavecSM_A = sm.Logit(Y, X).fit()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 107, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
Logit Regression Results
Dep. Variable: Status No. Observations: 1314
Model: Logit Df Residuals: 1312
Method: MLE Df Model: 1
Date: Sun, 12 Apr 2020 Pseudo R-squ.: -0.06046
Time: 17:50:18 Log-Likelihood: -827.33
converged: True LL-Null: -780.16
LLR p-value: 1.000
\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
coef std err z P>|z| [0.025 0.975]
Smoker -1.1611 0.114 -10.206 0.000 -1.384 -0.938
Age 0.0040 0.120 0.034 0.973 -0.231 0.239
" + ], + "text/plain": [ + "\n", + "\"\"\"\n", + " Logit Regression Results \n", + "==============================================================================\n", + "Dep. Variable: Status No. Observations: 1314\n", + "Model: Logit Df Residuals: 1312\n", + "Method: MLE Df Model: 1\n", + "Date: Sun, 12 Apr 2020 Pseudo R-squ.: -0.06046\n", + "Time: 17:50:18 Log-Likelihood: -827.33\n", + "converged: True LL-Null: -780.16\n", + " LLR p-value: 1.000\n", + "==============================================================================\n", + " coef std err z P>|z| [0.025 0.975]\n", + "------------------------------------------------------------------------------\n", + "Smoker -1.1611 0.114 -10.206 0.000 -1.384 -0.938\n", + "Age 0.0040 0.120 0.034 0.973 -0.231 0.239\n", + "==============================================================================\n", + "\"\"\"" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# MLRavecSM_A.params\n", "MLRavecSM_A.summary()" @@ -1078,9 +2285,96 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 108, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SmokerAge
count582.0582.000000
mean1.00.492433
std0.00.180399
min1.00.200222
25%1.00.348165
50%1.00.479422
75%1.00.624861
max1.00.992214
\n", + "
" + ], + "text/plain": [ + " Smoker Age\n", + "count 582.0 582.000000\n", + "mean 1.0 0.492433\n", + "std 0.0 0.180399\n", + "min 1.0 0.200222\n", + "25% 1.0 0.348165\n", + "50% 1.0 0.479422\n", + "75% 1.0 0.624861\n", + "max 1.0 0.992214" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Commencons par vérifier le contenu de X_pourEtudeMortaliteFumeuses et Y_pourEtudeMortaliteFumeuses\n", "X_pourEtudeMortaliteFumeuses.describe()\n", @@ -1089,9 +2383,87 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 109, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Status
count582.000000
mean0.238832
std0.426736
min0.000000
25%0.000000
50%0.000000
75%0.000000
max1.000000
\n", + "
" + ], + "text/plain": [ + " Status\n", + "count 582.000000\n", + "mean 0.238832\n", + "std 0.426736\n", + "min 0.000000\n", + "25% 0.000000\n", + "50% 0.000000\n", + "75% 0.000000\n", + "max 1.000000" + ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "Y_pourEtudeMortaliteFumeuses.describe()\n", "# Y_pourEtudeMortaliteFumeuses.head(10)" @@ -1099,9 +2471,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 111, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "taille1Colonne= 582\n" + ] + } + ], "source": [ "taille1Colonne = X_pourEtudeMortaliteFumeuses['Age'].shape[0] # devrait etre egal à 582\n", "print (\"taille1Colonne=\",taille1Colonne)" @@ -1109,12 +2489,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 112, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "verif : Nbre de deces chez les Fumeuses = 139 VS Nb_Fumeuses= 582\n" + ] + } + ], "source": [ - "# La somme des valeurs de 'Y_pourEtudeMortaliteFumeuses' renvoit le nb de deces chez les fumeuses :\n", - "NbDecesChezFumeuses = Y_pourEtudeMortaliteFumeuses.sum()\n", "print (\"verif : Nbre de deces chez les Fumeuses = \",NbDecesChezFumeuses,\" VS Nb_Fumeuses=\",Nb_Fumeuses)" ] }, @@ -1127,40 +2513,64 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 114, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.6/site-packages/sklearn/utils/validation.py:578: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples, ), for example using ravel().\n", + " y = column_or_1d(y, warn=True)\n" + ] + } + ], "source": [ - "if ( int(NbDecesChezFumeuses) == 0 ): # on verifie que Y_pourEtudeMortaliteFumeuses contient plus de 1 classe\n", + "if ( abs(NbDecesChezFumeuses) == 0 ): # on verifie que Y_pourEtudeMortaliteFumeuses contient plus de 1 classe\n", " print (\"Probleme : la somme Y_pourEtudeMortaliteFumeuses.sum() devrait être différente de 0 !\")\n", " print (\" les valeurs de Y_pourEtudeMortaliteFumeuses ne composent qu'une seule classe !\")\n", "else : # La regression logistique peut être effectuée :\n", - " SKL_MRL_B = LogisticRegression(penalty='none',solver='newton-cg')\n", + " SKL_MRL_B = LogisticRegression(penalty='l2',solver='newton-cg')\n", " SKL_MRL_B.fit(X_pourEtudeMortaliteFumeuses, Y_pourEtudeMortaliteFumeuses)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 115, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([[-3.64964135e-15, 5.37484238e+00]])" + ] + }, + "execution_count": 115, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Si souhaité, afficher dans un DataFrame les coefficients du modèle , avec la constante :\n", - "if False :\n", - " pd.DataFrame(np.concatenate([SKL_MRL_B.intercept_.reshape(-1,1),\n", - " SKL_MRL_B.coef_],axis=1),\n", - " index = [\"coef\"],\n", - " columns = [\"constante\"]+list(X_pourEtudeMortaliteFumeuses.columns)).T" + "# Afficher la valeur des coefficients pour ce modele : \n", + "SKL_MRL_B.coef_" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 119, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Modele de Regression Logistique sur le groupe des Fumeuses : score = 0.813\n" + ] + } + ], "source": [ "score__SKL_MRL_B = SKL_MRL_B.score(X_pourEtudeMortaliteFumeuses,Y_pourEtudeMortaliteFumeuses)\n", - "print 
(\"Modele de Regression Logistique sur le groupe des Fumeuses : score = \",score__SKL_MRL_B)" + "print (\"Modele de Regression Logistique sur le groupe des Fumeuses : score = %5.3f\" %score__SKL_MRL_B)" ] }, { @@ -1172,18 +2582,93 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 120, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Optimization terminated successfully.\n", + " Current function value: 0.412727\n", + " Iterations 7\n" + ] + } + ], "source": [ "MLRavecSM_B = sm.Logit(Y_pourEtudeMortaliteFumeuses, X_pourEtudeMortaliteFumeuses).fit()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 121, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
Logit Regression Results
Dep. Variable: Status No. Observations: 582
Model: Logit Df Residuals: 580
Method: MLE Df Model: 1
Date: Sun, 12 Apr 2020 Pseudo R-squ.: 0.2492
Time: 17:53:54 Log-Likelihood: -240.21
converged: True LL-Null: -319.94
LLR p-value: 1.477e-36
\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
coef std err z P>|z| [0.025 0.975]
Smoker -5.5081 0.466 -11.814 0.000 -6.422 -4.594
Age 7.9990 0.784 10.203 0.000 6.462 9.536
" + ], + "text/plain": [ + "\n", + "\"\"\"\n", + " Logit Regression Results \n", + "==============================================================================\n", + "Dep. Variable: Status No. Observations: 582\n", + "Model: Logit Df Residuals: 580\n", + "Method: MLE Df Model: 1\n", + "Date: Sun, 12 Apr 2020 Pseudo R-squ.: 0.2492\n", + "Time: 17:53:54 Log-Likelihood: -240.21\n", + "converged: True LL-Null: -319.94\n", + " LLR p-value: 1.477e-36\n", + "==============================================================================\n", + " coef std err z P>|z| [0.025 0.975]\n", + "------------------------------------------------------------------------------\n", + "Smoker -5.5081 0.466 -11.814 0.000 -6.422 -4.594\n", + "Age 7.9990 0.784 10.203 0.000 6.462 9.536\n", + "==============================================================================\n", + "\"\"\"" + ] + }, + "execution_count": 121, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "MLRavecSM_B.summary()" ] @@ -1197,11 +2682,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 122, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "verif : Nbre de deces chez les Non Fumeuses = 230 VS Nb_NonFumeuses= 732\n" + ] + } + ], "source": [ - "NbDecesChezNonFumeuses = Y_pourEtudeMortaliteNonFumeuses.sum()\n", "print (\"verif : Nbre de deces chez les Non Fumeuses = \",NbDecesChezNonFumeuses,\" VS Nb_NonFumeuses=\",Nb_NonFumeuses)" ] }, @@ -1214,40 +2706,75 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 124, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.6/site-packages/sklearn/utils/validation.py:578: DataConversionWarning: A column-vector y was passed when a 1d array was expected. 
Please change the shape of y to (n_samples, ), for example using ravel().\n", + " y = column_or_1d(y, warn=True)\n" + ] + } + ], "source": [ - "if ( int(NbDecesChezNonFumeuses) == 0 ): # on verifie que Y_pourEtudeMortaliteNonFumeuses contient plus de 1 classe\n", + "if abs(NbDecesChezNonFumeuses) == 0 : # on verifie que Y_pourEtudeMortaliteNonFumeuses contient plus de 1 classe\n", " print (\"Probleme : la somme Y_pourEtudeMortaliteNonFumeuses.sum() devrait être différente de 0 !\")\n", " print (\" les valeurs de Y_pourEtudeMortaliteNonFumeuses ne composent qu'une seule classe !\")\n", "else : # La regression logistique peut être effectuée :\n", - " SKL_MRL_C = LogisticRegression(penalty='none',solver='newton-cg')\n", + " SKL_MRL_C = LogisticRegression(penalty='l2',solver='newton-cg')\n", " SKL_MRL_C.fit(X_pourEtudeMortaliteNonFumeuses,Y_pourEtudeMortaliteNonFumeuses)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 125, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0. 
, 7.04605784]])" + ] + }, + "execution_count": 125, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Si souhaité, afficher dans un DataFrame les coefficients du modèle , avec la constante :\n", - "if False :\n", - " pd.DataFrame(np.concatenate([SKL_MRL_C.intercept_.reshape(-1,1),\n", - " SKL_MRL_C.coef_],axis=1),\n", - " index = [\"coef\"],\n", - " columns = [\"constante\"]+list(X_pourEtudeMortaliteNonFumeuses.columns)).T" + "# Afficher la valeur des coefficients pour ce modele : \n", + "SKL_MRL_C.coef_" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "La valeur du 1er coeff indique l'influence de la caractéristique Tabagisme sur la probabilité de mortalité\n", + "La valeur du 2nd coeff indique l'influence de la caractéristique Age sur la probabilité de mortalité\n", + "\n", + "On note le coeff nul pour la caractéristique Tabagisme : c'est un résultat attendu, puisque dans le groupe des non fumeuses cette variable est constante (toujours 0) et ne peut donc rien expliquer ; \n", + "tout se passe comme si la mortalité était seulement le fait de l'âge (aucune autre cause n'étant considérée ici)."
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 127, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Modele de Regression Logistique sur le groupe des Non Fumeuses : score = 0.873\n" + ] + } + ], "source": [ "score__SKL_MRL_C = SKL_MRL_C.score(X_pourEtudeMortaliteNonFumeuses,Y_pourEtudeMortaliteNonFumeuses)\n", - "print (\"Modele de Regression Logistique sur le groupe des Non Fumeuses : score = \",score__SKL_MRL_C)" + "print (\"Modele de Regression Logistique sur le groupe des Non Fumeuses : score = %5.3f\" %score__SKL_MRL_C)" ] }, { @@ -1259,22 +2786,60 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 129, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Optimization terminated successfully.\n", + " Current function value: 0.687904\n", + " Iterations 4\n" + ] + }, + { + "ename": "LinAlgError", + "evalue": "Singular matrix", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mLinAlgError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mMLRavecSM_C\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mLogit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mY_pourEtudeMortaliteNonFumeuses\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX_pourEtudeMortaliteNonFumeuses\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;31m# Mais erreur de type 'Singular Matrix Error' !\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m# voir 
https://stackoverflow.com/questions/20703733/logit-regression-and-singular-matrix-error-in-python\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;31m# En fait cette erreur est due au fait que\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;31m# tout se passe comme s'il y a redondance entre les 2 caractéristiques 'Age' et 'Mortalité' .\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/statsmodels/discrete/discrete_model.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, start_params, method, maxiter, full_output, disp, callback, **kwargs)\u001b[0m\n\u001b[1;32m 1832\u001b[0m bnryfit = super(Logit, self).fit(start_params=start_params,\n\u001b[1;32m 1833\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmaxiter\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmaxiter\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfull_output\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfull_output\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1834\u001b[0;31m disp=disp, callback=callback, **kwargs)\n\u001b[0m\u001b[1;32m 1835\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1836\u001b[0m \u001b[0mdiscretefit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mLogitResults\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbnryfit\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/statsmodels/discrete/discrete_model.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, start_params, method, maxiter, full_output, disp, callback, **kwargs)\u001b[0m\n\u001b[1;32m 218\u001b[0m mlefit = super(DiscreteModel, self).fit(start_params=start_params,\n\u001b[1;32m 219\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mmaxiter\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmaxiter\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfull_output\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfull_output\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 220\u001b[0;31m disp=disp, callback=callback, **kwargs)\n\u001b[0m\u001b[1;32m 221\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 222\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mmlefit\u001b[0m \u001b[0;31m# up to subclasses to wrap results\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/statsmodels/base/model.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, start_params, method, maxiter, full_output, disp, fargs, callback, retall, skip_hessian, **kwargs)\u001b[0m\n\u001b[1;32m 471\u001b[0m \u001b[0mHinv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcov_params_func\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mxopt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretvals\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 472\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mmethod\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'newton'\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mfull_output\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 473\u001b[0;31m \u001b[0mHinv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlinalg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0mretvals\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Hessian'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mnobs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 474\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mskip_hessian\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 475\u001b[0m \u001b[0mH\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhessian\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxopt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/numpy/linalg/linalg.py\u001b[0m in \u001b[0;36minv\u001b[0;34m(a)\u001b[0m\n\u001b[1;32m 530\u001b[0m \u001b[0msignature\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'D->D'\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misComplexType\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mt\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34m'd->d'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 531\u001b[0m \u001b[0mextobj\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_linalg_error_extobj\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_raise_linalgerror_singular\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 532\u001b[0;31m \u001b[0mainv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_umath_linalg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msignature\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msignature\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mextobj\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mextobj\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 533\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mwrap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mainv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult_t\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 534\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/numpy/linalg/linalg.py\u001b[0m in \u001b[0;36m_raise_linalgerror_singular\u001b[0;34m(err, flag)\u001b[0m\n\u001b[1;32m 87\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_raise_linalgerror_singular\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflag\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 89\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mLinAlgError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Singular matrix\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 90\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 91\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_raise_linalgerror_nonposdef\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflag\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mLinAlgError\u001b[0m: Singular matrix" + ] + } + ], "source": [ - "MLRavecSM_C = sm.Logit(Y_pourEtudeMortaliteNonFumeuses, X_pourEtudeMortaliteNonFumeuses).fit() # en principe \n", + "MLRavecSM_C = sm.Logit(Y_pourEtudeMortaliteNonFumeuses, X_pourEtudeMortaliteNonFumeuses).fit() \n", "# Mais erreur de type 'Singular Matrix Error' !\n", "# voir https://stackoverflow.com/questions/20703733/logit-regression-and-singular-matrix-error-in-python\n", - "# Pour contourner le probleme, soustraction selon l'explication et l'astuce fournie \n", - "# MLRavecSM_C = sm.Logit(Y_pourEtudeMortaliteNonFumeuses, X_pourEtudeMortaliteNonFumeuses-1).fit()" + "# En fait cette erreur vient vraisemblablement de la colonne 'Smoker', constante (nulle) dans ce groupe : \n", + "# la matrice hessienne est alors singulière (non inversible) ; retirer cette colonne de X devrait lever l'erreur."
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 130, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'MLRavecSM_C' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mMLRavecSM_C\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msummary\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mNameError\u001b[0m: name 'MLRavecSM_C' is not defined" + ] + } + ], "source": [ "MLRavecSM_C.summary()" ] @@ -1295,9 +2860,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 131, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Groupes Nb_Vivantes Nb_Mortes Score_Mod.Regr.Logistique\n", + "0 femmes fumeuses et non 945 369 0.850837\n", + "1 femmes fumeuses 443 139 0.812715\n", + "2 femmes non fumeuses 502 230 0.872951\n" + ] + } + ], "source": [ "tableMRL = {\"Groupes\": ['femmes fumeuses et non', 'femmes fumeuses', 'femmes non fumeuses'],\n", " 'Nb_Vivantes': [Nb_FumeusesVivantes+Nb_NonFumeusesVivantes, Nb_FumeusesVivantes, Nb_NonFumeusesVivantes],\n", @@ -1322,14 +2898,13 @@ "Commençons par rappeler que le même modele de régression logistique (de base mais très classique, sans inclure d'interception avec une constante, issu des librairies \"scikit learn\" d'une part \"statmodels\" d'autre part) a été utilisé pour les 3 types de groupes, en considérant tous ensemble les différents ages (afin de s'affranchir d'un biais induit par des regroupements en tranches d'âges arbitraires et non régulières) ; et que le score reflète la capacité de ce modèle de régression logistique à prédire la 
mortalité en fonction de l'age.\n", " \n", "De façon différente mais cohérente avec l'utilisation du modele de Regression Logistique issu de la librairie \"scikit learn\", les performances dans l'optimization (fit) du modele de Regression Logistique issu de la librairie \"statmodels\" terminée avec succes sont rappelées ci-dessous pour les 2 groupes traités séparemment :\n", - " * Fumeuses : en 7 itérations : valeur de la fonction = 0.412727 , coef de corrl avec Age = -0.0890 , std err = 0.009\n", - " * NonFumeuses : en 3 itérations : valeur de la fonction = 0.687904 , coef de corrl avec Age = -0.1073 , std err = 0.008\n", - "(remarque : les coefficients de corrélation entre les variables 'Age' et 'Status' sont normalement négatifs car ils reflètent le fait de tendances opposées: l'augmentation de l'age est corrélée avec la diminution des chances d'être en vie : pour l'intensité de la corrélation, il faut donc considérer la valeur absolue de ce coefficient). \n", + " * Fumeuses : en 7 itérations : score~0.81, valeur de la fonction ~0.41 , coef (scikit-learn) de probabilité de Mortalité avec l'Age = 5.37\n", + " * NonFumeuses : en 4 itérations : score~0.87, valeur de la fonction ~0.69 , coef (scikit-learn) de probabilité de Mortalité avec l'Age = 7.05\n", + "(remarque : les coefficients de probabilité entre les variables 'Age' et 'Status' sont normalement positifs car ils reflètent le fait que l'on a compté POSITIVEMENT la mortalité ; or celle-ci augmente avec l'âge). 
 \n", "\n", - "D'ailleurs, les scores 'TRAINING r_score' (cf la partie V) qui refletent les performances dans l'optimization (fit) du modele de Regression Linéaire confirment les observations et interpretations faites avec les essais de Regression Logistique.\n", + "Les scores 'TRAINING r_score' (cf la partie V) qui reflètent les performances dans l'optimisation (fit) du modèle de Régression Linéaire confirment les observations et interprétations faites avec les essais de Régression Logistique.\n", "\n", - "On observe que le score du modèle (autrement dit, la probabilité de décès en fonction de l'âge) pour le groupe des femmes non fumeuses est plus élevé que le score pour le groupe des fumeuses ; \n", - "Ceci s'interprete par le fait que, par-rapport aux femmes non fumeuses, les fumeuses ont une probalité plus grande de mourir en raison d'un autre facteur que l'age ; étant donné que dans cette étude , ne sont pris en compte que 2 facteurs explicatifs de la mortalité: le tabagisme et l'age, ces régressions logistiques permettent de conclure sur la nocivité du tabagisme d'une part. " + "Le score plus élevé obtenu pour le groupe des non fumeuses s'interprète par le fait que, par rapport aux femmes non fumeuses, les fumeuses ont une probabilité plus grande de mourir en raison d'un autre facteur que l'âge ; étant donné que dans cette étude, ne sont pris en compte que 2 facteurs explicatifs de la mortalité : le tabagisme et l'âge, ces régressions logistiques permettent de conclure sur la nocivité du tabagisme."
] }, { @@ -1341,7 +2916,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 132, "metadata": {}, "outputs": [], "source": [ @@ -1352,15 +2927,23 @@ " linearModel = LinearRegression(normalize=True).fit(X, Y)\n", " linearModel_trainingScore = linearModel.score(X, Y)\n", " print (\"LinearRegressionModel : TRAINING r_score = \",linearModel_trainingScore)\n", - "\n", + " #\n", " return (linearModel_trainingScore)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 133, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LinearRegressionModel : TRAINING r_score = 0.25916536274977575\n" + ] + } + ], "source": [ "# Testons le modele de Regression lineaire 'Statut Vivante_ou_Morte' VS 'Age' chez les Fumeuses :\n", "linearModel_trainingScore = buildLinearRegressionModel(X_pourEtudeMortaliteFumeuses,Y_pourEtudeMortaliteFumeuses)" @@ -1368,9 +2951,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 134, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LinearRegressionModel : TRAINING r_score = 0.44864670335441903\n" + ] + } + ], "source": [ "# Testons le modele de Regression lineaire 'Statut Vivante_ou_Morte' VS 'Age' chez les Non Fumeuses:\n", "linearModel_trainingScore = buildLinearRegressionModel(X_pourEtudeMortaliteNonFumeuses,Y_pourEtudeMortaliteNonFumeuses)" @@ -1384,6 +2975,114 @@ "ni sur la qualité toute relative des scores obtenus, \n", "on retrouve l'observation faite précédemment avec la regression logistique : à savoir, la probabilité de décès en fonction de l'âge pour le groupe des femmes non fumeuses est plus élevée que cette probabilité pour le groupe des fumeuses." 
 ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## VI - Prolongement avec un calcul des corrélations séparément sur les 2 groupes" + ] + }, + { + "cell_type": "code", + "execution_count": 135, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Correlations entre les variables pour les données concernant seulement le groupe des Fumeuses :\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeStatus
Age1.0000000.509083
Status0.5090831.000000
\n", + "
" + ], + "text/plain": [ + " Age Status\n", + "Age 1.000000 0.509083\n", + "Status 0.509083 1.000000" + ] + }, + "execution_count": 135, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print (\"Correlations entre les variables pour les données concernant seulement le groupe des Fumeuses :\")\n", + "DataConcatenee = X_pourEtudeMortaliteFumeuses.drop(['Smoker'], axis=1)\n", + "DataConcatenee['Status'] = Y_pourEtudeMortaliteFumeuses # on ajoute la colonne 'Status'\n", + "# DataConcatenee # pour vérifier\n", + "Fumeuses_Corrs = DataConcatenee.corr()\n", + "Fumeuses_Corrs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print (\"Correlations entre les variables pour les données concernant seulement le groupe des Non Fumeuses :\")\n", + "DataConcatenee = X_pourEtudeMortaliteNonFumeuses.drop(['Smoker'], axis=1) \n", + "DataConcatenee['Status'] = Y_pourEtudeMortaliteNonFumeuses # on ajoute la colonne 'Status'\n", + "# DataConcatenee # pour vérifier\n", + "NonFumeuses_Corrs = DataConcatenee.corr()\n", + "NonFumeuses_Corrs" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "Sans explication sur l'application à l'identique aux 2 groupes de femmes ni sur la qualité relative des coefficients de correlation obtenus,\n", + "on retrouve l'observation faite précédemment avec les modèles de regression : à savoir, la corrélation entre les 2 variables [mortalité] et [âge] est plus forte pour le groupe des femmes non fumeuses que pour le groupe des fumeuses. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {