no commit message

parent 55a7d0ea
......@@ -2517,59 +2517,46 @@
}
],
"source": [
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import isoweek\n",
"import os\n",
"import urllib\n",
"import urllib.request\n",
"import pandas as pd\n",
"\n",
"data_file = \"incidence-PAY-3.csv\"\n",
"if not os.path.isfile(data_file):\n",
" url = \"http://www.sentiweb.fr/datasets/incidence-PAY-3.csv\"\n",
" urllib.request.urlretrieve(url, data_file)\n",
"# Define the URL for the data\n",
"url = 'https://raw.githubusercontent.com/MicrosoftDocs/mslearn-introduction-to-machine-learning/main/Data/ml-basics/crime.csv'\n",
"\n",
"raw_data = pd.read_csv(data_file, skiprows=1)\n",
"raw_data[raw_data.isnull().any(axis=1)]\n",
"# Define the local filename\n",
"local_data_file = 'crime.csv'\n",
"\n",
"data = raw_data.dropna().copy()\n",
"# If the local file doesn't exist, download the data and save to the local file\n",
"if not os.path.exists(local_data_file):\n",
" urllib.request.urlretrieve(url, local_data_file)\n",
"\n",
"def convert_week(year_and_week_int):\n",
" year_and_week_str = str(year_and_week_int)\n",
" year = int(year_and_week_str[:4])\n",
" week = int(year_and_week_str[4:])\n",
" w = isoweek.Week(year, week)\n",
" return pd.Period(w.day(0), 'W')\n",
"# Read the data from the local file into a Pandas DataFrame\n",
"raw_data = pd.read_csv(local_data_file)\n",
"\n",
"data['period'] = [convert_week(yw) for yw in data['week']]\n",
"sorted_data = data.set_index('period').sort_index()\n",
"periods = sorted_data.index\n",
"# Drop the first column, which contains the crime codes\n",
"data = raw_data.drop(columns=['Offence Code'])\n",
"\n",
"for p1, p2 in zip(periods[:-1], periods[1:]):\n",
" delta = p2.to_timestamp() - p1.end_time\n",
" if delta > pd.Timedelta('1s'):\n",
" print(p1, p2)\n",
"# Convert the 'Reported Date' column to a Pandas datetime type\n",
"data['Reported Date'] = pd.to_datetime(data['Reported Date'], format='%d/%m/%Y')\n",
"\n",
"sorted_data['inc'].plot()\n",
"sorted_data['inc'][-200:].plot()\n",
"first_august_week = [pd.Period(pd.Timestamp(y, 8, 1), 'W')\n",
" for y in range(1985,\n",
" sorted_data.index[-1].year)]\n",
"year = []\n",
"yearly_incidence = []\n",
"# Filter the data to include only the weeks of interest\n",
"data = data[(data['Reported Date'] >= '1989-05-01') & (data['Reported Date'] <= '1989-05-07') | \n",
" (data['Reported Date'] >= '1989-05-15') & (data['Reported Date'] <= '1989-05-21')]\n",
"\n",
"for week1, week2 in zip(first_august_week[:-1],\n",
" first_august_week[1:]):\n",
" one_year = sorted_data['inc'][week1:week2-1]\n",
" assert abs(len(one_year)-52) < 2\n",
" yearly_incidence.append(one_year.sum())\n",
" year.append(week2.year)\n",
" \n",
"yearly_incidence = pd.Series(data=yearly_incidence, index=year)\n",
"# Compute the weekly crime incidence\n",
"weekly_incidence = data.groupby(pd.Grouper(key='Reported Date', freq='W')).size()\n",
"\n",
"# Compute the yearly crime incidence\n",
"yearly_incidence = weekly_incidence.groupby(weekly_incidence.index.year).sum()\n",
"year = yearly_incidence.index.astype(str)\n",
"yearly_incidence.index = year\n",
"\n",
"# Plot the yearly crime incidence\n",
"yearly_incidence.plot(style='*')\n",
"yearly_incidence.sort_values()\n",
"yearly_incidence.hist(xrot=20)\n"
"yearly_incidence.hist(xrot=20)\n",
"\n"
]
},
{
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment