missing data analysis

parent 40551ad4
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
"source": [ "source": [
"# Oscillation périodique\n", "# Oscillation périodique\n",
"\n", "\n",
"La première étape est de charger les données du fichier et de s'assurer de son contenu" "La première étape est de charger les données du fichier et de s'assurer de son contenu. D'après les commentaires en en-tête de fichier, la première colonne indique la date et la seconde la concentration de CO2 en micro-moles par mole (ppm). Les mesures sont alignées sur 12h00 sur le premier jour de la semaine."
] ]
}, },
{ {
...@@ -32,34 +32,88 @@ ...@@ -32,34 +32,88 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Data taken from\n",
"# https://scrippsco2.ucsd.edu/assets/data/atmospheric/stations/in_situ_co2/weekly/weekly_in_situ_co2_mlo.csv\n",
"\n",
"path = \"weekly_in_situ_co2_mlo.csv\"\n",
"raw_data = pd.read_csv(path, comment='\"', names=[\"Date\", \"Concentration\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Vérifions les semaines manquantes dans le dataset."
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"ename": "FileNotFoundError", "name": "stdout",
"evalue": "File b'weekly_in_situ_co2_mlo.csv' does not exist", "output_type": "stream",
"output_type": "error", "text": [
"traceback": [ "Missing weeks (74 in total):\n",
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "1958-05-10,\t1958-05-31,\t1958-06-07,\t1958-06-14,\t1958-06-21,\t1958-06-28,\t1958-08-23,\t1958-09-13,\t1958-09-20,\t1958-09-27,\t1958-10-04,\t1958-10-11,\t1958-10-18,\t1958-10-25,\t1958-11-01,\t1959-02-07,\t1959-03-14,\t1959-05-30,\t1959-08-15,\t1962-08-25,\t1962-09-01,\t1962-09-08,\t1962-12-29,\t1963-02-16,\t1963-05-04,\t1963-11-23,\t1964-01-25,\t1964-02-01,\t1964-02-08,\t1964-02-15,\t1964-02-22,\t1964-02-29,\t1964-03-07,\t1964-03-14,\t1964-03-21,\t1964-03-28,\t1964-04-04,\t1964-04-11,\t1964-04-18,\t1964-04-25,\t1964-05-02,\t1964-05-09,\t1964-05-16,\t1964-05-23,\t1964-06-13,\t1964-06-20,\t1964-08-08,\t1966-07-16,\t1966-07-23,\t1966-07-30,\t1966-11-05,\t1967-01-21,\t1967-01-28,\t1976-06-26,\t1984-03-31,\t1984-04-07,\t1984-04-14,\t1984-04-21,\t1985-08-03,\t2003-06-14,\t2003-10-11,\t2003-10-18,\t2005-02-26,\t2005-03-05,\t2005-03-12,\t2005-03-19,\t2006-02-11,\t2006-02-18,\t2007-01-27,\t2012-10-06,\t2012-10-13,\t2020-01-18,\t2022-12-03,\t2022-12-10\n",
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", "\n",
"\u001b[0;32m<ipython-input-4-a84f6f1aff3f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mpath\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"weekly_in_situ_co2_mlo.csv\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mraw_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "Missing weeks per year:\n",
"\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36mparser_f\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)\u001b[0m\n\u001b[1;32m 707\u001b[0m skip_blank_lines=skip_blank_lines)\n\u001b[1;32m 708\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 709\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 710\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 711\u001b[0m \u001b[0mparser_f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "1958 15\n",
"\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 447\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 448\u001b[0m \u001b[0;31m# Create the parser.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 449\u001b[0;31m \u001b[0mparser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTextFileReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 450\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 451\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mchunksize\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0miterator\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "1959 4\n",
"\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 816\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'has_index_names'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'has_index_names'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 817\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 818\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mengine\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 819\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 820\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "1962 4\n",
"\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m_make_engine\u001b[0;34m(self, engine)\u001b[0m\n\u001b[1;32m 1047\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_make_engine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mengine\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'c'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1048\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'c'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1049\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mCParserWrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1050\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1051\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'python'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "1963 3\n",
"\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, src, **kwds)\u001b[0m\n\u001b[1;32m 1693\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'allow_leading_cols'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex_col\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1694\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1695\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparsers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTextReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msrc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1696\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1697\u001b[0m \u001b[0;31m# XXX\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "1964 21\n",
"\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.__cinit__\u001b[0;34m()\u001b[0m\n", "1966 4\n",
"\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._setup_parser_source\u001b[0;34m()\u001b[0m\n", "1967 2\n",
"\u001b[0;31mFileNotFoundError\u001b[0m: File b'weekly_in_situ_co2_mlo.csv' does not exist" "1976 1\n",
"1984 4\n",
"1985 1\n",
"2003 3\n",
"2005 4\n",
"2006 2\n",
"2007 1\n",
"2012 2\n",
"2020 1\n",
"2022 2\n"
] ]
} }
], ],
"source": [ "source": [
"# Data taken from\n", "# Check if every line is the corresponding week\n",
"# https://scrippsco2.ucsd.edu/assets/data/atmospheric/stations/in_situ_co2/weekly/weekly_in_situ_co2_mlo.csv\n", "expected_date = set()\n",
"# Offset, in weeks, from the first expected week (ISO week 13 of 1958).\n",
"w = 0\n",
"while True:\n",
"    # NOTE(review): offsets past the last week of 1958 presumably roll over into the following years - confirm against the isoweek documentation.\n",
"    new_week = str(isoweek.Week(1958, 13+w).saturday())\n",
"    # The dates are ISO strings (YYYY-MM-DD), so comparing them as text orders them chronologically.\n",
"    if new_week >= '2024-01-01':\n",
"        break\n",
"    expected_date.add(new_week)\n",
"    w += 1\n",
"\n", "\n",
"path = \"weekly_in_situ_co2_mlo.csv\"\n", "# Remove line if found\n",
"raw_data = pd.read_csv(path)\n" "for w in raw_data.index:\n",
" stored_date = raw_data[\"Date\"][w]\n",
" expected_date.remove(stored_date)\n",
"\n",
"# Dates still present in `expected_date` were never matched by a row of the file: these are the missing weeks.\n",
"missing_weeks = sorted(expected_date)\n",
"print(f\"Missing weeks ({len(missing_weeks)} in total):\")\n",
"print(\",\\t\".join(missing_weeks))\n",
"\n",
"# The dates are ISO strings (YYYY-MM-DD), so the first four characters are the year.\n",
"missing_weeks_keep_year = [e[:4] for e in missing_weeks]\n",
"missing_weeks_per_year = dict()\n",
"for e in missing_weeks_keep_year:\n",
"    # dict.get with a default of 0 handles the first occurrence of a year and the later ones alike.\n",
"    missing_weeks_per_year[e] = missing_weeks_per_year.get(e, 0) + 1\n",
"\n",
"print(\"\\nMissing weeks per year:\")\n",
"for y in sorted(missing_weeks_per_year):\n",
"    print(y, missing_weeks_per_year[y])"
] ]
} }
], ],
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment