diff --git a/module3/exo2/Analyse de des dialogues de l'avare de Moliere.ipynb b/module3/exo2/Analyse de des dialogues de l'avare de Moliere.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..3835f130f4207b77389c1292d18b5ede780fe939 --- /dev/null +++ b/module3/exo2/Analyse de des dialogues de l'avare de Moliere.ipynb @@ -0,0 +1,139 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "hideCode": true, + "hidePrompt": true + }, + "source": [ + "# Etape 1 : classer les personnages selon le nombre de mots prononcés" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "hideCode": true, + "hidePrompt": true + }, + "source": [ + "Tout d'abord, il faut commencer par inclure les bibliothèques dont on aura besoin." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from collections import Counter" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: 'l_avare.md'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[0;31m# Chargement et traitement du texte au format Markdown\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 38\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"l_avare.md\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"r\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"utf-8\"\u001b[0m\u001b[0;34m)\u001b[0m 
# One upper-case word of a speaker name (ASCII capitals plus accented French capitals).
_NAME_WORD = "[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]+"

# Scene heading such as "## SCENE 3", "## SCÈNE IV" or "### SCENE 12".
# All leading '#' are consumed so that none is left behind in the previous scene.
_SCENE_HEADING_RE = re.compile(r"#{2,}\s*SC[EÈ]NE\s+(?:\d+|[IVXLCDM]+\b)")

# Speaker cue at the very start of a line: one or more upper-case words joined by
# a space, apostrophe or hyphen ("HARPAGON", "LA FLÈCHE", "MAÎTRE JACQUES"),
# optional whitespace, a colon, then the beginning of the speech.
_SPEAKER_RE = re.compile(
    r"^(" + _NAME_WORD + r"(?:[ '’-]" + _NAME_WORD + r")*)\s*:\s*(.*)"
)


def _count_words(speech):
    """Count whitespace-separated tokens that contain at least one letter or digit.

    Tokens made only of punctuation (e.g. the "!" or "?" that French typography
    separates from the preceding word) are not counted as words.
    """
    return sum(1 for token in speech.split() if re.search(r"[^\W_]", token))


def _append_speech(rows, scene_id, character, fragments):
    """Append ``[scene_id, character, word_count]`` to ``rows`` if the speech has words."""
    if character is None:
        return
    word_count = _count_words(' '.join(fragments))
    if word_count:
        rows.append([scene_id, character, word_count])


def extract_dialogues_markdown(text):
    """Extract per-speech word counts from a play written in Markdown.

    The text is split on scene headings. Inside each chunk, a line starting
    with an upper-case name followed by ':' opens a new speech; the following
    lines are continuations of that speech until the next cue or the end of
    the chunk.

    Parameters
    ----------
    text : str
        Full Markdown text of the play.

    Returns
    -------
    pandas.DataFrame
        One row per speech with columns ``scene_id``, ``character`` and
        ``word_count``. ``scene_id`` is the position of the chunk in the split
        text, so 0 is whatever precedes the first scene heading. ``character``
        is the lower-cased speaker name. Speeches without any word are omitted.
    """
    rows = []

    for scene_id, scene in enumerate(_SCENE_HEADING_RE.split(text)):
        current_character = None
        fragments = []

        for line in scene.splitlines():
            cue = _SPEAKER_RE.match(line)
            if cue:
                # A new cue closes the previous speech.
                _append_speech(rows, scene_id, current_character, fragments)
                current_character = cue.group(1).lower()  # lower-case for uniformity
                fragments = [cue.group(2)]
            elif current_character and not line.lstrip().startswith('#'):
                # Continuation line. Markdown headings (e.g. an act title placed
                # before the next scene heading) are not part of the speech.
                fragments.append(line.strip())

        # Last speech of the scene.
        _append_speech(rows, scene_id, current_character, fragments)

    return pd.DataFrame(rows, columns=["scene_id", "character", "word_count"])


# Driver: `__name__` is "__main__" inside a Jupyter kernel, so this still runs when
# the cell is executed, but importing the definitions above has no side effects.
if __name__ == "__main__":
    # Load the play. The path is relative to the notebook's working directory;
    # the saved output of this cell shows a FileNotFoundError when it is absent.
    with open("l_avare.md", "r", encoding="utf-8") as file:
        text = file.read()

    # Extract the speeches.
    dialogue_df = extract_dialogues_markdown(text)

    # Words per scene and per character, one column per character.
    scene_word_counts = (
        dialogue_df
        .groupby(["scene_id", "character"])["word_count"]
        .sum()
        .unstack(fill_value=0)
    )
    scene_totals = scene_word_counts.sum(axis=1)

    # Stacked bar chart of each character's share of the words in every scene.
    scene_word_counts.div(scene_totals, axis=0).plot(
        kind='bar', stacked=True, colormap='tab20', figsize=(12, 8)
    )
    plt.xlabel("Scène")
    plt.ylabel("Proportion de mots")
    plt.title("Répartition de la parole par personnage dans chaque scène")
    plt.legend(title="Personnage", bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()