diff --git a/module3/exo1/influenza-like-illness-analysis.ipynb b/module3/exo1/influenza-like-illness-analysis.ipynb
index 0f4a8df91fb2e23b8f0bd9baf5a915cdc423b74c..77da6249a77b9ee71b2c87436cc0d52c15b9ef3e 100644
--- a/module3/exo1/influenza-like-illness-analysis.ipynb
+++ b/module3/exo1/influenza-like-illness-analysis.ipynb
@@ -2484,7 +2484,59 @@
     "import pandas as pd\n",
     "import isoweek\n",
     "import os\n",
-    "import requests\n"
+    "import urllib.request  # plain 'import urllib' does not load the request submodule\n",
+    "\n",
+    "# Download the data only when no local copy exists\n",
+    "data_file = \"incidence-PAY-3.csv\"\n",
+    "if not os.path.isfile(data_file):\n",
+    "    url = \"http://www.sentiweb.fr/datasets/incidence-PAY-3.csv\"\n",
+    "    urllib.request.urlretrieve(url, data_file)\n",
+    "\n",
+    "raw_data = pd.read_csv(data_file, skiprows=1)\n",
+    "raw_data[raw_data.isnull().any(axis=1)]\n",
+    "\n",
+    "# Remove rows with missing values\n",
+    "data = raw_data.dropna().copy()\n",
+    "\n",
+    "# Convert a YYYYWW integer into a weekly pandas Period\n",
+    "def convert_week(year_and_week_int):\n",
+    "    year_and_week_str = str(year_and_week_int)\n",
+    "    year = int(year_and_week_str[:4])\n",
+    "    week = int(year_and_week_str[4:])\n",
+    "    w = isoweek.Week(year, week)\n",
+    "    return pd.Period(w.day(0), 'W')\n",
+    "\n",
+    "data['period'] = [convert_week(yw) for yw in data['week']]\n",
+    "sorted_data = data.set_index('period').sort_index()\n",
+    "periods = sorted_data.index\n",
+    "\n",
+    "# Print each pair of consecutive periods separated by more than one second (a gap)\n",
+    "for p1, p2 in zip(periods[:-1], periods[1:]):\n",
+    "    delta = p2.to_timestamp() - p1.end_time\n",
+    "    if delta > pd.Timedelta('1s'):\n",
+    "        print(p1, p2)\n",
+    "\n",
+    "sorted_data['inc'].plot()\n",
+    "sorted_data['inc'][-200:].plot()\n",
+    "# Sum the incidence between consecutive first-of-August weeks\n",
+    "first_august_week = [pd.Period(pd.Timestamp(y, 8, 1), 'W')\n",
+    "                     for y in range(1985,\n",
+    "                                    sorted_data.index[-1].year)]\n",
+    "year = []\n",
+    "yearly_incidence = []\n",
+    "\n",
+    "for week1, week2 in zip(first_august_week[:-1],\n",
+    "                        first_august_week[1:]):\n",
+    "    one_year = sorted_data['inc'][week1:week2-1]\n",
+    "    assert abs(len(one_year)-52) < 2\n",
+    "    yearly_incidence.append(one_year.sum())\n",
+    "    year.append(week2.year)\n",
+    "\n",
+    "yearly_incidence = pd.Series(data=yearly_incidence, index=year)\n",
+    "\n",
+    "yearly_incidence.plot(style='*')\n",
+    "yearly_incidence.sort_values()\n",
+    "yearly_incidence.hist(xrot=20)\n"
    ]
   },
   {
@@ -2493,11 +2545,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Define the URL of the remote CSV file\n",
-    "remote_data_url = \"http://www.sentiweb.fr/datasets/incidence-PAY-3.csv\"\n",
-    "\n",
-    "# Define the name of the local file\n",
-    "local_data_file = \"incidence-PAY-3.csv\"\n"
+    "\n"
    ]
   },
   {
@@ -2518,18 +2566,7 @@
     }
    ],
    "source": [
-    "# Check if the local file exists\n",
-    "if not os.path.isfile(local_data_file):\n",
-    "    # If the local file does not exist, download the data from the remote URL\n",
-    "    response = requests.get(remote_data_url)\n",
-    "    with open(local_data_file, \"w\") as f:\n",
-    "        f.write(response.text)\n",
-    "\n",
-    "# Read the data from the local file into a Pandas DataFrame\n",
-    "raw_data = pd.read_csv(local_data_file, skiprows=1)\n",
     "\n",
-    "# Remove rows with missing values\n",
-    "data = raw_data.dropna().copy()\n",
     "\n"
    ]
   },
@@ -2539,50 +2576,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Define a function to convert year and week integers to Pandas Period objects\n",
-    "def convert_week(year_and_week_int):\n",
-    "    year_and_week_str = str(year_and_week_int)\n",
-    "    year = int(year_and_week_str[:4])\n",
-    "    week = int(year_and_week_str[4:])\n",
-    "    w = isoweek.Week(year, week)\n",
-    "    return pd.Period(w.day(0), 'W')\n",
-    "\n",
-    "# Add a column to the DataFrame containing the Period objects\n",
-    "data['period'] = [convert_week(yw) for yw in data['week']]\n",
-    "\n",
-    "# Sort the data by the period column\n",
-    "sorted_data = data.set_index('period').sort_index()\n",
-    "\n",
-    "# Check for gaps in the data and print any that are found\n",
-    "periods = sorted_data.index\n",
-    "for p1, p2 in zip(periods[:-1], periods[1:]):\n",
-    "    delta = p2.to_timestamp() - p1.end_time\n",
-    "    if delta > pd.Timedelta('1s'):\n",
-    "        print(p1, p2)\n",
-    "\n",
-    "# Plot the incidence data over time\n",
-    "sorted_data['inc'].plot()\n",
-    "\n",
-    "# Plot the last 200 data points\n",
-    "sorted_data['inc'][-200:].plot()\n",
-    "\n",
-    "# Compute the total incidence for each year and plot the results\n",
-    "first_august_week = [pd.Period(pd.Timestamp(y, 8, 1), 'W')\n",
-    "                     for y in range(1985,\n",
-    "                                    sorted_data.index[-1].year)]\n",
-    "year = []\n",
-    "yearly_incidence = []\n",
-    "for week1, week2 in zip(first_august_week[:-1],\n",
-    "                        first_august_week[1:]):\n",
-    "    one_year = sorted_data['inc'][week1:week2-1]\n",
-    "    assert abs(len(one_year)-52) < 2\n",
-    "    yearly_incidence.append(one_year.sum())\n",
-    "    year.append(week2.year)\n",
-    "yearly_incidence = pd.Series(data=yearly_incidence, index=year)\n",
-    "\n",
-    "yearly_incidence.plot(style='*')\n",
-    "yearly_incidence.sort_values()\n",
-    "yearly_incidence.hist(xrot=20)\n"
+    "\n"
    ]
   }
  ],