no commit message

parent 25eb9b3a
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"hideCode": true,
"hidePrompt": true
},
"source": [
"# Etape 1 : classer les personnages selon le nombre de mots prononcés"
]
},
{
"cell_type": "markdown",
"metadata": {
"hideCode": true,
"hidePrompt": true
},
"source": [
"Tout d'abord, il faut commencer par inclure les bibliothèques dont on aura besoin."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: 'l_avare.md'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-4-36d7bdb3cdc6>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[0;31m# Chargement et traitement du texte au format Markdown\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 38\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"l_avare.md\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"r\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"utf-8\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 39\u001b[0m \u001b[0mtext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 40\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'l_avare.md'"
]
}
],
"source": [
"def extract_dialogues_markdown(text):\n",
" # Séparation des scènes\n",
" scenes = re.split(r'##\\s*SCENE\\s+\\d+', text) # Divise le texte par les scènes\n",
" dialogue_data = []\n",
"\n",
" for scene_id, scene in enumerate(scenes):\n",
" lines = scene.split('\\n')\n",
" current_character = None\n",
" dialogue = \"\"\n",
" \n",
" for line in lines:\n",
" # Utilisation de regex pour détecter les noms de personnages en majuscules suivis de ':'\n",
" match = re.match(r'^([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]+):\\s*(.*)', line)\n",
" \n",
" if match:\n",
" # Si un nom est détecté, sauvegardons la réplique précédente\n",
" if current_character and dialogue:\n",
" word_count = len(dialogue.split())\n",
" dialogue_data.append([scene_id, current_character, word_count])\n",
" \n",
" # Actualisation du personnage actuel et début de nouvelle réplique\n",
" current_character = match.group(1).lower() # Nom en minuscule pour uniformité\n",
" dialogue = match.group(2) # Commence une nouvelle réplique\n",
"\n",
" elif current_character:\n",
" # Continuation de la réplique sur plusieurs lignes\n",
" dialogue += ' ' + line.strip()\n",
"\n",
" # Enregistrement de la dernière réplique de la scène\n",
" if current_character and dialogue:\n",
" word_count = len(dialogue.split())\n",
" dialogue_data.append([scene_id, current_character, word_count])\n",
"\n",
" # Conversion en DataFrame\n",
" return pd.DataFrame(dialogue_data, columns=[\"scene_id\", \"character\", \"word_count\"])\n",
"\n",
"# Chargement et traitement du texte au format Markdown\n",
"with open(\"l_avare.md\", \"r\", encoding=\"utf-8\") as file:\n",
" text = file.read()\n",
"\n",
"# Extraction des dialogues\n",
"dialogue_df = extract_dialogues_markdown(text)\n",
"\n",
"# Analyse : nombre de mots par scène et personnage\n",
"scene_word_counts = dialogue_df.groupby([\"scene_id\", \"character\"])[\"word_count\"].sum().unstack(fill_value=0)\n",
"scene_totals = scene_word_counts.sum(axis=1)\n",
"\n",
"# Création du graphique empilé\n",
"scene_word_counts.div(scene_totals, axis=0).plot(kind='bar', stacked=True, colormap='tab20', figsize=(12, 8))\n",
"plt.xlabel(\"Scène\")\n",
"plt.ylabel(\"Proportion de mots\")\n",
"plt.title(\"Répartition de la parole par personnage dans chaque scène\")\n",
"plt.legend(title=\"Personnage\", bbox_to_anchor=(1.05, 1), loc='upper left')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"hide_code_all_hidden": true,
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"hideCode": true,
"hidePrompt": true
},
"source": [
"# titre"
]
},
{
"cell_type": "markdown",
"metadata": {
"hideCode": true,
"hidePrompt": true
},
"source": [
"Tout d'abord, il faut commencer par inclure les bibliothèques dont on aura besoin."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import isoweek"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"ename": "AttributeError",
"evalue": "'Index' object has no attribute 'labels'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-12-d45da91e46f1>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 50\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 51\u001b[0m \u001b[0;31m# Comptage des mots par personnage et par scène\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 52\u001b[0;31m \u001b[0mscene_word_counts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdialogue_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"scene_id\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"character\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"word_count\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munstack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfill_value\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 53\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 54\u001b[0m \u001b[0;31m# Vérification que les données sont bien formatées avant le graphique\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/series.py\u001b[0m in \u001b[0;36munstack\u001b[0;34m(self, level, fill_value)\u001b[0m\n\u001b[1;32m 2222\u001b[0m \"\"\"\n\u001b[1;32m 2223\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpandas\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0munstack\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2224\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0munstack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlevel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfill_value\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2225\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2226\u001b[0m \u001b[0;31m# ----------------------------------------------------------------------\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/reshape/reshape.py\u001b[0m in \u001b[0;36munstack\u001b[0;34m(obj, level, fill_value)\u001b[0m\n\u001b[1;32m 472\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 473\u001b[0m unstacker = _Unstacker(obj.values, obj.index, level=level,\n\u001b[0;32m--> 474\u001b[0;31m fill_value=fill_value)\n\u001b[0m\u001b[1;32m 475\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0munstacker\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 476\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/reshape/reshape.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, values, index, level, value_columns, fill_value)\u001b[0m\n\u001b[1;32m 105\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 106\u001b[0m \u001b[0;31m# when index includes `nan`, need to lift levels/strides by 1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 107\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlift\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlabels\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlevel\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 108\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 109\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnew_index_levels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlevels\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mAttributeError\u001b[0m: 'Index' object has no attribute 'labels'"
]
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"hide_code_all_hidden": true,
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment