From 8787a94c8dc51509a4abe8f3a89867ed73309d2b Mon Sep 17 00:00:00 2001 From: 7404ea6678ce6fbf3a726e36f2bf2079 <7404ea6678ce6fbf3a726e36f2bf2079@app-learninglab.inria.fr> Date: Fri, 4 Oct 2024 20:27:06 +0000 Subject: [PATCH] Ongoing work on question 2 --- module3/exo3/exercice_fr.ipynb | 433 ++++++++++++++++++++++++--------- 1 file changed, 315 insertions(+), 118 deletions(-) diff --git a/module3/exo3/exercice_fr.ipynb b/module3/exo3/exercice_fr.ipynb index 7458038..b88de3f 100644 --- a/module3/exo3/exercice_fr.ipynb +++ b/module3/exo3/exercice_fr.ipynb @@ -185,7 +185,8 @@ "metadata": {}, "outputs": [], "source": [ - "import re" + "import re\n", + "import copy" ] }, { @@ -344,7 +345,11 @@ " # la seconde partie définit le personnage du clerc\n", " # et commence par \" et \" \n", " emptyPersoDict[parts[0]] = persoCaracs\n", - " emptyPersoDict[parts[1][4:]] = persoCaracs\n", + " # On fait une copie profonde pour ne pas\n", + " # que l'objet partage la même adresse mémoire\n", + " clercCaracs = copy.deepcopy(persoCaracs)\n", + " clercCaracs[\"links\"].append(\"assistant du commissaire\")\n", + " emptyPersoDict[parts[1][4:]] = clercCaracs\n", " \n", " currentLine = fileToAnalyse.readline()\n", " lineNum += 1\n", @@ -624,7 +629,13 @@ " # La ligne n'est pas vide, ce n'est pas l'exception\n", " for namePerso in persoDict:\n", " # On vérifie si namePerso fait partie des protagonistes\n", - " m = re.search(namePerso, currentLine, re.IGNORECASE)\n", + " # Pour prendre en compte le problème avec les maitres/maîtres\n", + " # il faut adapter name Perso pour être une expression régulière\n", + " persoRegex = namePerso\n", + " if namePerso.startswith(\"Maitre\"):\n", + " persoRegex = namePerso.replace('Maitre','Maître')\n", + " \n", + " m = re.search(persoRegex, currentLine, re.IGNORECASE)\n", " if m is not None:\n", " # Le résultat de la recherche n'est pas vide,\n", " # le personnage fait partie des protagonistes\n", @@ -933,7 +944,7 @@ " ...\n", " \n", " \n", - " 841\n", + " 
931\n", " 5\n", " Harpagon\n", " Valère\n", @@ -941,31 +952,23 @@ " 2\n", " \n", " \n", - " 842\n", + " 932\n", " 5\n", " Valère\n", - " Harpagon\n", - " 5\n", - " 4\n", - " \n", - " \n", - " 843\n", - " 5\n", - " Harpagon\n", - " Valère\n", + " Maître Jacques\n", " 5\n", - " 2\n", + " 6\n", " \n", " \n", - " 844\n", + " 933\n", " 5\n", - " Valère\n", + " Maître Jacques\n", " Harpagon\n", " 5\n", - " 13\n", + " 7\n", " \n", " \n", - " 845\n", + " 934\n", " 5\n", " Harpagon\n", " Valère\n", @@ -973,7 +976,7 @@ " 10\n", " \n", " \n", - " 846\n", + " 935\n", " 5\n", " Valère\n", " Harpagon\n", @@ -981,7 +984,7 @@ " 10\n", " \n", " \n", - " 847\n", + " 936\n", " 5\n", " Harpagon\n", " Valère\n", @@ -989,7 +992,7 @@ " 9\n", " \n", " \n", - " 848\n", + " 937\n", " 5\n", " Cléante\n", " Harpagon\n", @@ -997,7 +1000,7 @@ " 41\n", " \n", " \n", - " 849\n", + " 938\n", " 5\n", " Harpagon\n", " Cléante\n", @@ -1005,7 +1008,7 @@ " 3\n", " \n", " \n", - " 850\n", + " 939\n", " 5\n", " Cléante\n", " Harpagon\n", @@ -1013,7 +1016,7 @@ " 47\n", " \n", " \n", - " 851\n", + " 940\n", " 5\n", " Harpagon\n", " Cléante\n", @@ -1021,7 +1024,7 @@ " 7\n", " \n", " \n", - " 852\n", + " 941\n", " 5\n", " Cléante\n", " Mariane\n", @@ -1029,7 +1032,7 @@ " 36\n", " \n", " \n", - " 853\n", + " 942\n", " 5\n", " Mariane\n", " Anselme\n", @@ -1037,7 +1040,7 @@ " 36\n", " \n", " \n", - " 854\n", + " 943\n", " 5\n", " Anselme\n", " Harpagon\n", @@ -1045,7 +1048,7 @@ " 61\n", " \n", " \n", - " 855\n", + " 944\n", " 5\n", " Harpagon\n", " Cléante\n", @@ -1053,7 +1056,7 @@ " 11\n", " \n", " \n", - " 856\n", + " 945\n", " 5\n", " Cléante\n", " Harpagon\n", @@ -1061,7 +1064,7 @@ " 6\n", " \n", " \n", - " 857\n", + " 946\n", " 5\n", " Harpagon\n", " Anselme\n", @@ -1069,7 +1072,7 @@ " 13\n", " \n", " \n", - " 858\n", + " 947\n", " 5\n", " Anselme\n", " Harpagon\n", @@ -1077,7 +1080,7 @@ " 13\n", " \n", " \n", - " 859\n", + " 948\n", " 5\n", " Harpagon\n", " Anselme\n", @@ -1085,7 +1088,7 @@ " 12\n", " 
\n", " \n", - " 860\n", + " 949\n", " 5\n", " Anselme\n", " Harpagon\n", @@ -1093,7 +1096,7 @@ " 8\n", " \n", " \n", - " 861\n", + " 950\n", " 5\n", " Harpagon\n", " Anselme\n", @@ -1101,7 +1104,7 @@ " 12\n", " \n", " \n", - " 862\n", + " 951\n", " 5\n", " Anselme\n", " le Commissaire\n", @@ -1109,7 +1112,7 @@ " 13\n", " \n", " \n", - " 863\n", + " 952\n", " 5\n", " le Commissaire\n", " Harpagon\n", @@ -1117,7 +1120,7 @@ " 14\n", " \n", " \n", - " 864\n", + " 953\n", " 5\n", " Harpagon\n", " le Commissaire\n", @@ -1125,7 +1128,7 @@ " 8\n", " \n", " \n", - " 865\n", + " 954\n", " 5\n", " le Commissaire\n", " Harpagon\n", @@ -1133,15 +1136,23 @@ " 12\n", " \n", " \n", - " 866\n", + " 955\n", " 5\n", " Harpagon\n", + " Maître Jacques\n", + " 6\n", + " 12\n", + " \n", + " \n", + " 956\n", + " 5\n", + " Maître Jacques\n", " Anselme\n", " 6\n", - " 35\n", + " 23\n", " \n", " \n", - " 867\n", + " 957\n", " 5\n", " Anselme\n", " Harpagon\n", @@ -1149,7 +1160,7 @@ " 8\n", " \n", " \n", - " 868\n", + " 958\n", " 5\n", " Harpagon\n", " Anselme\n", @@ -1157,7 +1168,7 @@ " 5\n", " \n", " \n", - " 869\n", + " 959\n", " 5\n", " Anselme\n", " Harpagon\n", @@ -1165,7 +1176,7 @@ " 11\n", " \n", " \n", - " 870\n", + " 960\n", " 5\n", " Harpagon\n", " Anselme\n", @@ -1174,7 +1185,7 @@ " \n", " \n", "\n", - "

871 rows × 5 columns

\n", + "

961 rows × 5 columns

\n", "" ], "text/plain": [ @@ -1210,38 +1221,38 @@ "28 1 Cléante Élise 2 150\n", "29 1 Élise Cléante 2 27\n", ".. ... ... ... ... ...\n", - "841 5 Harpagon Valère 5 2\n", - "842 5 Valère Harpagon 5 4\n", - "843 5 Harpagon Valère 5 2\n", - "844 5 Valère Harpagon 5 13\n", - "845 5 Harpagon Valère 5 10\n", - "846 5 Valère Harpagon 5 10\n", - "847 5 Harpagon Valère 5 9\n", - "848 5 Cléante Harpagon 6 41\n", - "849 5 Harpagon Cléante 6 3\n", - "850 5 Cléante Harpagon 6 47\n", - "851 5 Harpagon Cléante 6 7\n", - "852 5 Cléante Mariane 6 36\n", - "853 5 Mariane Anselme 6 36\n", - "854 5 Anselme Harpagon 6 61\n", - "855 5 Harpagon Cléante 6 11\n", - "856 5 Cléante Harpagon 6 6\n", - "857 5 Harpagon Anselme 6 13\n", - "858 5 Anselme Harpagon 6 13\n", - "859 5 Harpagon Anselme 6 12\n", - "860 5 Anselme Harpagon 6 8\n", - "861 5 Harpagon Anselme 6 12\n", - "862 5 Anselme le Commissaire 6 13\n", - "863 5 le Commissaire Harpagon 6 14\n", - "864 5 Harpagon le Commissaire 6 8\n", - "865 5 le Commissaire Harpagon 6 12\n", - "866 5 Harpagon Anselme 6 35\n", - "867 5 Anselme Harpagon 6 8\n", - "868 5 Harpagon Anselme 6 5\n", - "869 5 Anselme Harpagon 6 11\n", - "870 5 Harpagon Anselme 6 6\n", + "931 5 Harpagon Valère 5 2\n", + "932 5 Valère Maître Jacques 5 6\n", + "933 5 Maître Jacques Harpagon 5 7\n", + "934 5 Harpagon Valère 5 10\n", + "935 5 Valère Harpagon 5 10\n", + "936 5 Harpagon Valère 5 9\n", + "937 5 Cléante Harpagon 6 41\n", + "938 5 Harpagon Cléante 6 3\n", + "939 5 Cléante Harpagon 6 47\n", + "940 5 Harpagon Cléante 6 7\n", + "941 5 Cléante Mariane 6 36\n", + "942 5 Mariane Anselme 6 36\n", + "943 5 Anselme Harpagon 6 61\n", + "944 5 Harpagon Cléante 6 11\n", + "945 5 Cléante Harpagon 6 6\n", + "946 5 Harpagon Anselme 6 13\n", + "947 5 Anselme Harpagon 6 13\n", + "948 5 Harpagon Anselme 6 12\n", + "949 5 Anselme Harpagon 6 8\n", + "950 5 Harpagon Anselme 6 12\n", + "951 5 Anselme le Commissaire 6 13\n", + "952 5 le Commissaire Harpagon 6 14\n", + "953 5 Harpagon le 
Commissaire 6 8\n", + "954 5 le Commissaire Harpagon 6 12\n", + "955 5 Harpagon Maître Jacques 6 12\n", + "956 5 Maître Jacques Anselme 6 23\n", + "957 5 Anselme Harpagon 6 8\n", + "958 5 Harpagon Anselme 6 5\n", + "959 5 Anselme Harpagon 6 11\n", + "960 5 Harpagon Anselme 6 6\n", "\n", - "[871 rows x 5 columns]" + "[961 rows x 5 columns]" ] }, "execution_count": 13, @@ -1281,50 +1292,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "## Réponse à la première question\n", "Maintenant, nous pouvons répondre à la première question." ] }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Harpagon a prononcé 6992 mots dans toute la pièce.\n", - "Cléante a prononcé 3555 mots dans toute la pièce.\n", - "Élise a prononcé 1067 mots dans toute la pièce.\n", - "Valère a prononcé 3016 mots dans toute la pièce.\n", - "Mariane a prononcé 910 mots dans toute la pièce.\n", - "Anselme a prononcé 517 mots dans toute la pièce.\n", - "Frosine a prononcé 2381 mots dans toute la pièce.\n", - "Maitre Simon a prononcé 0 mots dans toute la pièce.\n", - "Maitre Jacques a prononcé 0 mots dans toute la pièce.\n", - "La Flèche a prononcé 1520 mots dans toute la pièce.\n", - "Dame Claude a prononcé 0 mots dans toute la pièce.\n", - "Brindavoine a prononcé 43 mots dans toute la pièce.\n", - "La Merluche a prononcé 47 mots dans toute la pièce.\n", - "Le commissaire a prononcé 409 mots dans toute la pièce.\n", - "son clerc a prononcé 0 mots dans toute la pièce.\n" - ] - } - ], - "source": [ - "# On va parcourir le dictionnaire des personnages\n", - "# et compter le nombre total de mots prononcés\n", - "# pour chacun.\n", - "import numpy as np\n", - "persoList = []\n", - "persoNbWords = np.zeros(len(avarePersoDict),dtype=int)\n", - "#print(textDataSynthesisTableDf.shape)\n", - "for perso in avarePersoDict:\n", - " m = textDataSynthesisTableDf['author'].str.match(perso, case=False, 
na=False)\n", - " tmpDf = textDataSynthesisTableDf[m].copy()\n", - " print(\"{} a prononcé {} mots dans toute la pièce.\".format(perso,tmpDf['speech_length'].sum()))" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -1352,6 +1323,167 @@ "Nous avons donc corriger le code pour prendre en compte ces deux problèmes." ] }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
perso
speech_length
0Dame Claude
0son clerc
43Brindavoine
47La Merluche
197Maitre Simon
294Le commissaire
517Anselme
910Mariane
1067Élise
1520La Flèche
1668Maitre Jacques
2370Frosine
2740Valère
3331Cléante
5753Harpagon
\n", + "
" + ], + "text/plain": [ + " perso\n", + "speech_length \n", + "0 Dame Claude\n", + "0 son clerc\n", + "43 Brindavoine\n", + "47 La Merluche\n", + "197 Maitre Simon\n", + "294 Le commissaire\n", + "517 Anselme\n", + "910 Mariane\n", + "1067 Élise\n", + "1520 La Flèche\n", + "1668 Maitre Jacques\n", + "2370 Frosine\n", + "2740 Valère\n", + "3331 Cléante\n", + "5753 Harpagon" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# On va parcourir le dictionnaire des personnages\n", + "# et compter le nombre total de mots prononcés\n", + "# pour chacun.\n", + "import numpy as np\n", + "persoList = []\n", + "persoNbWords = np.zeros(len(avarePersoDict),dtype=int)\n", + "#print(textDataSynthesisTableDf.shape)\n", + "for index, perso in enumerate(avarePersoDict):\n", + " persoRegex = perso\n", + " if perso.startswith(\"Maitre\"):\n", + " persoRegex = perso.replace('Maitre','Maître')\n", + " m = textDataSynthesisTableDf['author'].str.match(persoRegex, case=False, na=False)\n", + " tmpDf = textDataSynthesisTableDf[m].copy()\n", + " persoList.append(perso)\n", + " persoNbWords[index] = tmpDf['speech_length'].sum()\n", + "\n", + "data = pd.DataFrame({\"perso\":persoList, \"speech_length\":persoNbWords})\n", + " \n", + "sortedData = data.set_index('speech_length').sort_index()\n", + "sortedData\n", + " \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Nous pouvons désormais finir de répondre à la première question posée. Le personnage s'exprimant le plus dans la pièce de _L'Avare_ de Molière et qui est donc le protagoniste principal, n'est autre qu'**Harpagon**. Quant aux personnages ne s'exprimant pas, nous en avons deux, le **clerc** du commissaire ainsi que **Dame Claude** qui est la servante d'Harpagon." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "On va utiliser le classement pour associer à chaque personnage de la pièce une couleur unique. 
On va donc reprendre le dictionnaire des personnages et régler la propriété de couleur." + ] + }, { "cell_type": "code", "execution_count": null, @@ -1359,6 +1491,71 @@ "outputs": [], "source": [] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Réponse à la seconde question" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "En nous appuyant sur une page internet présentant un exemple d'affichage similaire à ce qui est fait dans l'étude \\(à savoir un affichage par Acte, sous forme de barres horizontales pour chaque scène le composant donnant la répartition de la parole entre les différents protagonistes de chaque scène\\), on va pouvoir répondre à la seconde question. Voici un lien vers celle-ci: [geeksforgeeks_stacked-percentage-bar-plot](https://www.geeksforgeeks.org/stacked-percentage-bar-plot-in-matplotlib/)." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5\n" + ] + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "# Il va nous falloir compter le nombre d'actes de la pièce\n", + "# et faire une boucle sur ces derniers pour construire \n", + "# un dataframe par acte à partir du dataframe global\n", + "# dont on va caler la structure en fonction de\n", + "# l'exemple disponible sur la page.\n", + "# Le dataframe doit contenir une ligne par scène de l'acte\n", + "# courant et une colonne par protagoniste de la scène\n", + "actNums = pd.unique(textDataSynthesisTableDf[\"act\"])\n", + "nbActs = actNums.size\n", + "\n", + "fig = plt.subplots(nbActs, 1)\n", + "\n", + "for actNum in actNums:\n", + " actDf = textDataSynthesisTableDf[textDataSynthesisTableDf==actNum].copy()\n", + " sceneNums = pd.unique(actDf[\"scene\"])\n", + " actPersos = pd.unique(actDf[\"author\"])\n", + " actDict = {\"scene\":[]}\n", + " for perso in actPersos:\n", + " # Création d'une colonne par personnage de 
l'acte\n", + " actDict[perso] = []\n", + " \n", + " for sceneNum in sceneNums:\n", + " actDict[\"scene\"].append(sceneNum)\n", + " sceneDf = actDf[actDf[\"scene\"]==sceneNum].copy()\n", + " scenePersos = pd.unique(sceneDf)\n", + " for perso in actPersos:\n", + " tmpDf = sceneDf[sceneDf[\"author\"]==perso].copy()\n", + " if tmpDf:\n", + " currentPersoNbWordsInScene = tmpDf[\"speech_length\"].sum()\n", + " actDict[perso] = currentPersoNbWordsInScene\n", + " else:\n", + " actDict[perso] = 0\n", + " pass" + ] + }, { "cell_type": "markdown", "metadata": {}, -- 2.18.1