Commit 2e2ccaad authored by Adam Taheraly's avatar Adam Taheraly

reponse devoir

parent b0db00a7
......@@ -3,7 +3,7 @@
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="fr" xml:lang="fr">
<head>
<!-- 2020-04-03 ven. 16:36 -->
<!-- 2020-04-05 dim. 01:48 -->
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Autour du Paradoxe de Simpson</title>
......@@ -240,187 +240,353 @@ for the JavaScript code in this tag.
<h2>Table des matières</h2>
<div id="text-table-of-contents">
<ul>
<li><a href="#org228e6c5">1. Chargement des librairies R</a></li>
<li><a href="#org2059b8b">2. Mise en place des données</a>
<li><a href="#org538b66e">1. Analyse des données</a>
<ul>
<li><a href="#orgd49b650">2.1. Chargement</a></li>
<li><a href="#orgd27fa49">2.2. Vérification</a></li>
</ul>
</li>
<li><a href="#org244e357">3. Analyse des données</a>
<ul>
<li><a href="#orgcb8b213">3.1. Question 1</a></li>
<li><a href="#org90c7a98">1.1. Question 1</a></li>
<li><a href="#org6ed7f60">1.2. Question 2</a></li>
<li><a href="#org63fe99e">1.3. Question 3</a></li>
</ul>
</li>
</ul>
</div>
</div>
<div id="outline-container-org228e6c5" class="outline-2">
<h2 id="org228e6c5"><span class="section-number-2">1</span> Chargement des librairies R</h2>
</div>
<div id="outline-container-org2059b8b" class="outline-2">
<h2 id="org2059b8b"><span class="section-number-2">2</span> Mise en place des données</h2>
<div class="outline-text-2" id="text-2">
<div id="outline-container-org538b66e" class="outline-2">
<h2 id="org538b66e"><span class="section-number-2">1</span> Analyse des données</h2>
<div class="outline-text-2" id="text-1">
</div>
<div id="outline-container-orgd49b650" class="outline-3">
<h3 id="orgd49b650"><span class="section-number-3">2.1</span> Chargement</h3>
<div class="outline-text-3" id="text-2-1">
<div id="outline-container-org90c7a98" class="outline-3">
<h3 id="org90c7a98"><span class="section-number-3">1.1</span> Question 1</h3>
<div class="outline-text-3" id="text-1-1">
<div class="org-src-container">
<pre class="src src-R">simpson_data <span style="color: #008b8b;">&lt;-</span> read.csv(<span style="color: #8b2252;">"Subject6_smoking.csv"</span>)
<pre class="src src-R">dead_or_alive <span style="color: #008b8b;">&lt;-</span> simpson_data<span style="color: #008b8b;">%&gt;%</span>
group_by(Smoker)<span style="color: #008b8b;">%&gt;%</span>
summarise(Number = n(), Alive = sum(Status == <span style="color: #8b2252;">"Alive"</span>), Dead = sum(Status == <span style="color: #8b2252;">"Dead"</span>), mortality = (sum(Status == <span style="color: #8b2252;">"Dead"</span>)/n())*100)
dead_or_alive$mortality <span style="color: #008b8b;">&lt;-</span> round(dead_or_alive$mortality, 2)
</pre>
</div>
<p>
Les données sont sous la forme :
</p>
<table border="2" cellspacing="0" cellpadding="6" rules="groups" frame="hsides">
<colgroup>
<col class="org-left" />
<col class="org-left" />
<col class="org-right" />
<col class="org-right" />
<col class="org-right" />
<col class="org-right" />
</colgroup>
<thead>
<tr>
<th scope="col" class="org-left">Libellé</th>
<th scope="col" class="org-left">Description</th>
<th scope="col" class="org-left">Smoker</th>
<th scope="col" class="org-right">Number</th>
<th scope="col" class="org-right">Alive</th>
<th scope="col" class="org-right">Dead</th>
<th scope="col" class="org-right">mortality</th>
</tr>
</thead>
<tbody>
<tr>
<td class="org-left">Smoker</td>
<td class="org-left">Fume (Yes) ou non (No)</td>
<td class="org-left">No</td>
<td class="org-right">732</td>
<td class="org-right">502</td>
<td class="org-right">230</td>
<td class="org-right">31.42</td>
</tr>
<tr>
<td class="org-left">Status</td>
<td class="org-left">En vie (Alive) ou mort (Dead)</td>
</tr>
<tr>
<td class="org-left">Age</td>
<td class="org-left">Age du sujet</td>
<td class="org-left">Yes</td>
<td class="org-right">582</td>
<td class="org-right">443</td>
<td class="org-right">139</td>
<td class="org-right">23.88</td>
</tr>
</tbody>
</table>
<div class="org-src-container">
<pre class="src src-R">mortality <span style="color: #008b8b;">&lt;-</span> ggplot(dead_or_alive, aes(x = Smoker, y = mortality, fill = Smoker)) + geom_bar(stat = <span style="color: #8b2252;">"identity"</span>) +
labs(x = <span style="color: #8b2252;">"Smoker"</span>, y = <span style="color: #8b2252;">"Mortality rate"</span>, title = <span style="color: #8b2252;">"Mortality rate by behaviour"</span>)
mortality
</pre>
</div>
<p>
<img src="file:///tmp/babel-zzxmyL/figurezRqW9s.png" alt="figurezRqW9s.png" />
Le résultat est surprenant car on observe un plus fort taux de
mortalité chez les non-fumeurs que chez les fumeurs.
</p>
</div>
</div>
<div id="outline-container-orgd27fa49" class="outline-3">
<h3 id="orgd27fa49"><span class="section-number-3">2.2</span> Vérification</h3>
<div class="outline-text-3" id="text-2-2">
<div id="outline-container-org6ed7f60" class="outline-3">
<h3 id="org6ed7f60"><span class="section-number-3">1.2</span> Question 2</h3>
<div class="outline-text-3" id="text-1-2">
<p>
Vérification du bon chargement des données en visualisant le début et
la fin des données.
On recrée un data-frame pour ne pas interagir avec les résultats
précédents.
</p>
<div class="org-src-container">
<pre class="src src-R">head(simpson_data)
tail(simpson_data)
<pre class="src src-R">simpson_data_q2 <span style="color: #008b8b;">&lt;-</span> simpson_data
simpson_data_q2$Age <span style="color: #008b8b;">&lt;-</span> as.numeric(simpson_data_q2$Age)
</pre>
</div>
<p>
On regroupe les âges en catégories.
</p>
<div class="org-src-container">
<pre class="src src-R">simpson_data_q2$Age <span style="color: #008b8b;">&lt;-</span> cut(simpson_data_q2$Age, c(18, 34, 54, 64, 200), include.lowest = <span style="color: #228b22;">TRUE</span>, labels = c(<span style="color: #8b2252;">"18-34"</span>, <span style="color: #8b2252;">"35-54"</span>, <span style="color: #8b2252;">"55-64"</span>, <span style="color: #8b2252;">"plus de 65"</span>))
table(simpson_data_q2$Age)
</pre>
</div>
<pre class="example">
Smoker Status Age
1 Yes Alive 21.0
2 Yes Alive 19.3
3 No Dead 57.5
4 No Alive 47.1
5 Yes Alive 81.4
6 No Alive 36.8
Smoker Status Age
1309 No Alive 42.1
1310 Yes Alive 35.9
1311 No Alive 22.3
1312 Yes Dead 62.1
1313 No Dead 88.6
1314 No Alive 39.1
18-34 35-54 55-64 plus de 65
400 436 236 242
</pre>
<p>
Vérification de la présence de données manquante.
</p>
<div class="org-src-container">
<pre class="src src-R">na_records <span style="color: #008b8b;">&lt;-</span> apply(simpson_data, 1, <span style="color: #a020f0;">function</span>(x) any(is.na(x)))
simpson_data[na_records,]
<pre class="src src-R">dead_or_alive2 <span style="color: #008b8b;">&lt;-</span> simpson_data_q2<span style="color: #008b8b;">%&gt;%</span>
group_by(Smoker, Age)<span style="color: #008b8b;">%&gt;%</span>
summarise(Number = n(), Alive = sum(Status == <span style="color: #8b2252;">"Alive"</span>), Dead = sum(Status == <span style="color: #8b2252;">"Dead"</span>))<span style="color: #008b8b;">%&gt;%</span>
ungroup()<span style="color: #008b8b;">%&gt;%</span>
group_by(Smoker)<span style="color: #008b8b;">%&gt;%</span>
mutate(mortality = (Dead/sum(Number))*100)
dead_or_alive$mortality <span style="color: #008b8b;">&lt;-</span> round(dead_or_alive$mortality, 2)
</pre>
</div>
<pre class="example">
<table border="2" cellspacing="0" cellpadding="6" rules="groups" frame="hsides">
<colgroup>
<col class="org-left" />
<col class="org-right" />
<col class="org-right" />
<col class="org-right" />
<col class="org-right" />
<col class="org-right" />
</colgroup>
<thead>
<tr>
<th scope="col" class="org-left">Smoker</th>
<th scope="col" class="org-right">Age</th>
<th scope="col" class="org-right">Number</th>
<th scope="col" class="org-right">Alive</th>
<th scope="col" class="org-right">Dead</th>
<th scope="col" class="org-right">mortality</th>
</tr>
</thead>
<tbody>
<tr>
<td class="org-left">No</td>
<td class="org-right">18-34</td>
<td class="org-right">219</td>
<td class="org-right">213</td>
<td class="org-right">6</td>
<td class="org-right">0.819672131147541</td>
</tr>
<tr>
<td class="org-left">No</td>
<td class="org-right">35-54</td>
<td class="org-right">199</td>
<td class="org-right">180</td>
<td class="org-right">19</td>
<td class="org-right">2.59562841530055</td>
</tr>
<tr>
<td class="org-left">No</td>
<td class="org-right">55-64</td>
<td class="org-right">121</td>
<td class="org-right">81</td>
<td class="org-right">40</td>
<td class="org-right">5.46448087431694</td>
</tr>
<tr>
<td class="org-left">No</td>
<td class="org-right">plus de 65</td>
<td class="org-right">193</td>
<td class="org-right">28</td>
<td class="org-right">165</td>
<td class="org-right">22.5409836065574</td>
</tr>
[1] Smoker Status Age
&lt;0 lignes&gt; (ou 'row.names' de longueur nulle)
<tr>
<td class="org-left">Yes</td>
<td class="org-right">18-34</td>
<td class="org-right">181</td>
<td class="org-right">176</td>
<td class="org-right">5</td>
<td class="org-right">0.859106529209622</td>
</tr>
<tr>
<td class="org-left">Yes</td>
<td class="org-right">35-54</td>
<td class="org-right">237</td>
<td class="org-right">196</td>
<td class="org-right">41</td>
<td class="org-right">7.0446735395189</td>
</tr>
<tr>
<td class="org-left">Yes</td>
<td class="org-right">55-64</td>
<td class="org-right">115</td>
<td class="org-right">64</td>
<td class="org-right">51</td>
<td class="org-right">8.76288659793814</td>
</tr>
<tr>
<td class="org-left">Yes</td>
<td class="org-right">plus de 65</td>
<td class="org-right">49</td>
<td class="org-right">7</td>
<td class="org-right">42</td>
<td class="org-right">7.21649484536082</td>
</tr>
</tbody>
</table>
<div class="org-src-container">
<pre class="src src-R">mortality2 <span style="color: #008b8b;">&lt;-</span> ggplot(dead_or_alive2, aes(x = Smoker, y = mortality, fill = Age))+ geom_bar(stat = <span style="color: #8b2252;">"identity"</span>, position = position_dodge()) +
labs(x = <span style="color: #8b2252;">"Smoker"</span>, y = <span style="color: #8b2252;">"Mortality rate"</span>, title = <span style="color: #8b2252;">"Mortality rate by behaviour"</span>)
mortality2
</pre>
</div>
<div class="figure">
<p><img src="file:///tmp/babel-zzxmyL/figureQLQAz5.png" alt="figureQLQAz5.png" />
</p>
</div>
<p>
Vérification de la classe des données et du nombre de data.
On observe une diminution du taux de mortalité chez les moins de 65
ans parmi les non-fumeurs par rapport aux fumeurs. Cependant, chez les
plus de 65 ans, la tendance s'inverse avec une plus grande mortalité
chez les non-fumeurs par rapport aux fumeurs (3 fois plus).
</p>
<p>
Cela peut s'expliquer par le fait que le corps des fumeurs se soit
habitué aux toxines, alors que les non-fumeurs subissant le tabagisme
passif et ayant atteint un âge où ils sont plus à risque soient plus à
même de décéder à cause des toxines du tabac.
</p>
</div>
</div>
<div id="outline-container-org63fe99e" class="outline-3">
<h3 id="org63fe99e"><span class="section-number-3">1.3</span> Question 3</h3>
<div class="outline-text-3" id="text-1-3">
<p>
On recrée un data-frame pour ne pas interagir avec les résultats
précédents.
</p>
<div class="org-src-container">
<pre class="src src-R">simpson_data_q3 <span style="color: #008b8b;">&lt;-</span> simpson_data
simpson_data_q3$Death <span style="color: #008b8b;">&lt;-</span> factor(simpson_data_q3$Status)
simpson_data_q3$Death <span style="color: #008b8b;">&lt;-</span> fct_recode(simpson_data_q3$Death, <span style="color: #8b2252;">"1"</span>=<span style="color: #8b2252;">"Alive"</span>, <span style="color: #8b2252;">"0"</span>=<span style="color: #8b2252;">"Dead"</span>)
simpson_data_q3$Death <span style="color: #008b8b;">&lt;-</span> as.numeric(as.character(simpson_data_q3$Death))
simpson_data_q3$Age <span style="color: #008b8b;">&lt;-</span> as.numeric(as.character(simpson_data_q3$Age))
</pre>
</div>
<div class="org-src-container">
<pre class="src src-R">nrow(simpson_data)
class(simpson_data$Smoker)
class(simpson_data$Status)
class(simpson_data$Age)
<pre class="src src-R">head(simpson_data_q3)
</pre>
</div>
<pre class="example">
[1] 1314
Smoker Status Age Death
1 Yes Alive 21.0 1
2 Yes Alive 19.3 1
3 No Dead 57.5 0
4 No Alive 47.1 1
5 Yes Alive 81.4 1
6 No Alive 36.8 1
[1] "factor"
</pre>
[1] "factor"
<div class="org-src-container">
<pre class="src src-R">reg_log_total <span style="color: #008b8b;">&lt;-</span> ggplot(simpson_data_q3, aes(x=Age,y=Death)) + geom_point(alpha=.3,size=3) +
theme_bw() +
geom_smooth(method = <span style="color: #8b2252;">"glm"</span>,
method.args = list(family = <span style="color: #8b2252;">"binomial"</span>),fullrange = <span style="color: #228b22;">TRUE</span>)
[1] "numeric"
reg_log_fumeur <span style="color: #008b8b;">&lt;-</span> ggplot(simpson_data_q3[simpson_data_q3$Smoker == <span style="color: #8b2252;">"Yes"</span>,], aes(x=Age,y=Death)) + geom_point(alpha=.3,size=3) +
theme_bw() +
geom_smooth(method = <span style="color: #8b2252;">"glm"</span>,
method.args = list(family = <span style="color: #8b2252;">"binomial"</span>),fullrange = <span style="color: #228b22;">TRUE</span>)
reg_log_non_fumeur <span style="color: #008b8b;">&lt;-</span> ggplot(simpson_data_q3[simpson_data_q3$Smoker == <span style="color: #8b2252;">"No"</span>,], aes(x=Age,y=Death)) + geom_point(alpha=.3,size=3) +
theme_bw() +
geom_smooth(method = <span style="color: #8b2252;">"glm"</span>,
method.args = list(family = <span style="color: #8b2252;">"binomial"</span>),fullrange = <span style="color: #228b22;">TRUE</span>)
</pre>
</div>
<div class="org-src-container">
<pre class="src src-R">reg_log
</pre>
</div>
</div>
<div id="outline-container-org244e357" class="outline-2">
<h2 id="org244e357"><span class="section-number-2">3</span> Analyse des données</h2>
<div class="outline-text-2" id="text-3">
<div class="figure">
<p><img src="file:///tmp/babel-zzxmyL/figureSDZwCf.png" alt="figureSDZwCf.png" />
</p>
</div>
<div id="outline-container-orgcb8b213" class="outline-3">
<h3 id="orgcb8b213"><span class="section-number-3">3.1</span> Question 1</h3>
<div class="outline-text-3" id="text-3-1">
<div class="org-src-container">
<pre class="src src-R">dead_or_alive <span style="color: #008b8b;">&lt;-</span> simpson_data<span style="color: #008b8b;">%&gt;%</span>
group_by(Smoker)<span style="color: #008b8b;">%&gt;%</span>
summarise(Number = n(), Alive = sum(Status == <span style="color: #8b2252;">"Alive"</span>), Dead = sum(Status == <span style="color: #8b2252;">"Dead"</span>), mortality_rate = (sum(Status == <span style="color: #8b2252;">"Dead"</span>)/n())*100)
xtable(dead_or_alive)
<pre class="src src-R">reg_log_fumeur
</pre>
</div>
<pre class="example">
% latex table generated in R 3.6.3 by xtable 1.8-4 package
% Fri Apr 3 16:36:26 2020
\begin{table}[ht]
\centering
\begin{tabular}{rlrrrr}
\hline
&amp; Smoker &amp; Number &amp; Alive &amp; Dead &amp; mortality\_rate \\
\hline
1 &amp; No &amp; 732 &amp; 502 &amp; 230 &amp; 31.42 \\
2 &amp; Yes &amp; 582 &amp; 443 &amp; 139 &amp; 23.88 \\
\hline
\end{tabular}
\end{table}
<div class="figure">
<p><img src="file:///tmp/babel-zzxmyL/figureu04D5C.png" alt="figureu04D5C.png" />
</p>
</div>
<div class="org-src-container">
<pre class="src src-R">reg_log_non_fumeur
</pre>
</div>
<p>
<img src="file:///tmp/babel-zzxmyL/figuregIqfqf.png" alt="figuregIqfqf.png" />
Passé 60 ans, bien que les courbes soient semblables, on voit que l'intervalle de confiance chez les fumeurs
est plus grand que chez les non-fumeurs. Il est donc possible qu'il y
ait moins de morts chez les fumeurs car le corps est habitué à la cigarette.</p>
</div>
</div>
</div>
</div>
<div id="postamble" class="status">
<p class="date">Date: \daily</p>
<p class="author">Auteur: Adam Taheraly</p>
<p class="date">Created: 2020-04-03 ven. 16:36</p>
<p class="date">Created: 2020-04-05 dim. 01:48</p>
<p class="validation"><a href="http://validator.w3.org/check?uri=referer">Validate</a></p>
</div>
</body>
......
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="fr" xml:lang="fr">
<head>
<!-- 2020-04-03 ven. 16:36 -->
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Autour du Paradoxe de Simpson</title>
<meta name="generator" content="Org mode" />
<meta name="author" content="Adam Taheraly" />
<style type="text/css">
<!--/*--><![CDATA[/*><!--*/
.title { text-align: center;
margin-bottom: .2em; }
.subtitle { text-align: center;
font-size: medium;
font-weight: bold;
margin-top:0; }
.todo { font-family: monospace; color: red; }
.done { font-family: monospace; color: green; }
.priority { font-family: monospace; color: orange; }
.tag { background-color: #eee; font-family: monospace;
padding: 2px; font-size: 80%; font-weight: normal; }
.timestamp { color: #bebebe; }
.timestamp-kwd { color: #5f9ea0; }
.org-right { margin-left: auto; margin-right: 0px; text-align: right; }
.org-left { margin-left: 0px; margin-right: auto; text-align: left; }
.org-center { margin-left: auto; margin-right: auto; text-align: center; }
.underline { text-decoration: underline; }
#postamble p, #preamble p { font-size: 90%; margin: .2em; }
p.verse { margin-left: 3%; }
pre {
border: 1px solid #ccc;
box-shadow: 3px 3px 3px #eee;
padding: 8pt;
font-family: monospace;
overflow: auto;
margin: 1.2em;
}
pre.src {
position: relative;
overflow: visible;
padding-top: 1.2em;
}
pre.src:before {
display: none;
position: absolute;
background-color: white;
top: -10px;
right: 10px;
padding: 3px;
border: 1px solid black;
}
pre.src:hover:before { display: inline;}
/* Languages per Org manual */
pre.src-asymptote:before { content: 'Asymptote'; }
pre.src-awk:before { content: 'Awk'; }
pre.src-C:before { content: 'C'; }
/* pre.src-C++ doesn't work in CSS */
pre.src-clojure:before { content: 'Clojure'; }
pre.src-css:before { content: 'CSS'; }
pre.src-D:before { content: 'D'; }
pre.src-ditaa:before { content: 'ditaa'; }
pre.src-dot:before { content: 'Graphviz'; }
pre.src-calc:before { content: 'Emacs Calc'; }
pre.src-emacs-lisp:before { content: 'Emacs Lisp'; }
pre.src-fortran:before { content: 'Fortran'; }
pre.src-gnuplot:before { content: 'gnuplot'; }
pre.src-haskell:before { content: 'Haskell'; }
pre.src-hledger:before { content: 'hledger'; }
pre.src-java:before { content: 'Java'; }
pre.src-js:before { content: 'Javascript'; }
pre.src-latex:before { content: 'LaTeX'; }
pre.src-ledger:before { content: 'Ledger'; }
pre.src-lisp:before { content: 'Lisp'; }
pre.src-lilypond:before { content: 'Lilypond'; }
pre.src-lua:before { content: 'Lua'; }
pre.src-matlab:before { content: 'MATLAB'; }
pre.src-mscgen:before { content: 'Mscgen'; }
pre.src-ocaml:before { content: 'Objective Caml'; }
pre.src-octave:before { content: 'Octave'; }
pre.src-org:before { content: 'Org mode'; }
pre.src-oz:before { content: 'OZ'; }
pre.src-plantuml:before { content: 'Plantuml'; }
pre.src-processing:before { content: 'Processing.js'; }
pre.src-python:before { content: 'Python'; }
pre.src-R:before { content: 'R'; }
pre.src-ruby:before { content: 'Ruby'; }
pre.src-sass:before { content: 'Sass'; }
pre.src-scheme:before { content: 'Scheme'; }
pre.src-screen:before { content: 'Gnu Screen'; }
pre.src-sed:before { content: 'Sed'; }
pre.src-sh:before { content: 'shell'; }
pre.src-sql:before { content: 'SQL'; }
pre.src-sqlite:before { content: 'SQLite'; }
/* additional languages in org.el's org-babel-load-languages alist */
pre.src-forth:before { content: 'Forth'; }
pre.src-io:before { content: 'IO'; }
pre.src-J:before { content: 'J'; }
pre.src-makefile:before { content: 'Makefile'; }
pre.src-maxima:before { content: 'Maxima'; }
pre.src-perl:before { content: 'Perl'; }
pre.src-picolisp:before { content: 'Pico Lisp'; }
pre.src-scala:before { content: 'Scala'; }
pre.src-shell:before { content: 'Shell Script'; }
pre.src-ebnf2ps:before { content: 'ebfn2ps'; }
/* additional language identifiers per "defun org-babel-execute"
in ob-*.el */
pre.src-cpp:before { content: 'C++'; }
pre.src-abc:before { content: 'ABC'; }
pre.src-coq:before { content: 'Coq'; }
pre.src-groovy:before { content: 'Groovy'; }
/* additional language identifiers from org-babel-shell-names in
ob-shell.el: ob-shell is the only babel language using a lambda to put
the execution function name together. */
pre.src-bash:before { content: 'bash'; }
pre.src-csh:before { content: 'csh'; }
pre.src-ash:before { content: 'ash'; }
pre.src-dash:before { content: 'dash'; }
pre.src-ksh:before { content: 'ksh'; }
pre.src-mksh:before { content: 'mksh'; }
pre.src-posh:before { content: 'posh'; }
/* Additional Emacs modes also supported by the LaTeX listings package */
pre.src-ada:before { content: 'Ada'; }
pre.src-asm:before { content: 'Assembler'; }
pre.src-caml:before { content: 'Caml'; }
pre.src-delphi:before { content: 'Delphi'; }
pre.src-html:before { content: 'HTML'; }
pre.src-idl:before { content: 'IDL'; }
pre.src-mercury:before { content: 'Mercury'; }
pre.src-metapost:before { content: 'MetaPost'; }
pre.src-modula-2:before { content: 'Modula-2'; }
pre.src-pascal:before { content: 'Pascal'; }
pre.src-ps:before { content: 'PostScript'; }
pre.src-prolog:before { content: 'Prolog'; }
pre.src-simula:before { content: 'Simula'; }
pre.src-tcl:before { content: 'tcl'; }
pre.src-tex:before { content: 'TeX'; }
pre.src-plain-tex:before { content: 'Plain TeX'; }
pre.src-verilog:before { content: 'Verilog'; }
pre.src-vhdl:before { content: 'VHDL'; }
pre.src-xml:before { content: 'XML'; }
pre.src-nxml:before { content: 'XML'; }
/* add a generic configuration mode; LaTeX export needs an additional
(add-to-list 'org-latex-listings-langs '(conf " ")) in .emacs */
pre.src-conf:before { content: 'Configuration File'; }
table { border-collapse:collapse; }
caption.t-above { caption-side: top; }
caption.t-bottom { caption-side: bottom; }
td, th { vertical-align:top; }
th.org-right { text-align: center; }
th.org-left { text-align: center; }
th.org-center { text-align: center; }
td.org-right { text-align: right; }
td.org-left { text-align: left; }
td.org-center { text-align: center; }
dt { font-weight: bold; }
.footpara { display: inline; }
.footdef { margin-bottom: 1em; }
.figure { padding: 1em; }
.figure p { text-align: center; }
.inlinetask {
padding: 10px;
border: 2px solid gray;
margin: 10px;
background: #ffffcc;
}
#org-div-home-and-up
{ text-align: right; font-size: 70%; white-space: nowrap; }
textarea { overflow-x: auto; }
.linenr { font-size: smaller }
.code-highlighted { background-color: #ffff00; }
.org-info-js_info-navigation { border-style: none; }
#org-info-js_console-label
{ font-size: 10px; font-weight: bold; white-space: nowrap; }
.org-info-js_search-highlight
{ background-color: #ffff00; color: #000000; font-weight: bold; }
.org-svg { width: 90%; }
/*]]>*/-->
</style>
<link rel="stylesheet" type="text/css" href="http://www.pirilampo.org/styles/readtheorg/css/htmlize.css"/>
<link rel="stylesheet" type="text/css" href="http://www.pirilampo.org/styles/readtheorg/css/readtheorg.css"/>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.3/jquery.min.js"></script>
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/js/bootstrap.min.js"></script>
<script type="text/javascript" src="http://www.pirilampo.org/styles/lib/js/jquery.stickytableheaders.js"></script>
<script type="text/javascript" src="http://www.pirilampo.org/styles/readtheorg/js/readtheorg.js"></script>
<script type="text/javascript">
/*
@licstart The following is the entire license notice for the
JavaScript code in this tag.
Copyright (C) 2012-2019 Free Software Foundation, Inc.
The JavaScript code in this tag is free software: you can
redistribute it and/or modify it under the terms of the GNU
General Public License (GNU GPL) as published by the Free Software
Foundation, either version 3 of the License, or (at your option)
any later version. The code is distributed WITHOUT ANY WARRANTY;
without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU GPL for more details.
As additional permission under GNU GPL version 3 section 7, you
may distribute non-source (e.g., minimized or compacted) forms of
that code without the copy of the GNU GPL normally required by
section 4, provided you include this license notice and a URL
through which recipients can access the Corresponding Source.
@licend The above is the entire license notice
for the JavaScript code in this tag.
*/
<!--/*--><![CDATA[/*><!--*/
/**
 * Highlight a code reference together with the line it points at.
 *
 * Looks up the element whose id is `id`. When it exists, the current class
 * names of `elem` and of that element are saved on `elem` (as
 * `cacheClassElem` and `cacheClassTarget`) and both elements are switched to
 * the "code-highlighted" class. When no such element exists nothing happens.
 *
 * @param {Object} elem Element that triggered the highlight.
 * @param {string} id   Id of the element to highlight along with `elem`.
 */
function CodeHighlightOn(elem, id)
{
  var highlighted = "code-highlighted";
  var target = document.getElementById(id);
  if (target == null) {
    return;
  }
  // Save both original class names before overwriting either of them.
  elem.cacheClassElem = elem.className;
  elem.cacheClassTarget = target.className;
  target.className = highlighted;
  elem.className = highlighted;
}
/**
 * Undo a highlight: restore the class names cached on `elem`.
 *
 * Reads `elem.cacheClassElem` and `elem.cacheClassTarget` and writes them
 * back to `elem` and to the element whose id is `id`.
 *
 * @param {Object} elem Element whose highlight is being removed.
 * @param {string} id   Id of the element that was highlighted with `elem`.
 */
function CodeHighlightOff(elem, id)
{
  var target = document.getElementById(id);
  // Test against null/undefined rather than truthiness: an element that had
  // no class caches "" and must still get that empty class restored,
  // otherwise it would stay highlighted.
  if (elem.cacheClassElem != null)
    elem.className = elem.cacheClassElem;
  // The target may be absent from the document; skip it instead of throwing.
  if (target != null && elem.cacheClassTarget != null)
    target.className = elem.cacheClassTarget;
}
/*]]>*///-->
</script>
</head>
<body>
<div id="content">
<h1 class="title">Autour du Paradoxe de Simpson</h1>
<div id="table-of-contents">
<h2>Table des matières</h2>
<div id="text-table-of-contents">
<ul>
<li><a href="#org228e6c5">1. Chargement des librairies R</a></li>
<li><a href="#org2059b8b">2. Mise en place des données</a>
<ul>
<li><a href="#orgd49b650">2.1. Chargement</a></li>
<li><a href="#orgd27fa49">2.2. Vérification</a></li>
</ul>
</li>
<li><a href="#org244e357">3. Analyse des données</a>
<ul>
<li><a href="#orgcb8b213">3.1. Question 1</a></li>
</ul>
</li>
</ul>
</div>
</div>
<div id="outline-container-org228e6c5" class="outline-2">
<h2 id="org228e6c5"><span class="section-number-2">1</span> Chargement des librairies R</h2>
</div>
<div id="outline-container-org2059b8b" class="outline-2">
<h2 id="org2059b8b"><span class="section-number-2">2</span> Mise en place des données</h2>
<div class="outline-text-2" id="text-2">
</div>
<div id="outline-container-orgd49b650" class="outline-3">
<h3 id="orgd49b650"><span class="section-number-3">2.1</span> Chargement</h3>
<div class="outline-text-3" id="text-2-1">
<div class="org-src-container">
<pre class="src src-R">simpson_data <span style="color: #008b8b;">&lt;-</span> read.csv(<span style="color: #8b2252;">"Subject6_smoking.csv"</span>)
</pre>
</div>
<p>
Les données sont sous la forme :
</p>
<table border="2" cellspacing="0" cellpadding="6" rules="groups" frame="hsides">
<colgroup>
<col class="org-left" />
<col class="org-left" />
</colgroup>
<thead>
<tr>
<th scope="col" class="org-left">Libellé</th>
<th scope="col" class="org-left">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td class="org-left">Smoker</td>
<td class="org-left">Fume (Yes) ou non (No)</td>
</tr>
<tr>
<td class="org-left">Status</td>
<td class="org-left">En vie (Alive) ou mort (Dead)</td>
</tr>
<tr>
<td class="org-left">Age</td>
<td class="org-left">Age du sujet</td>
</tr>
</tbody>
</table>
</div>
</div>
<div id="outline-container-orgd27fa49" class="outline-3">
<h3 id="orgd27fa49"><span class="section-number-3">2.2</span> Vérification</h3>
<div class="outline-text-3" id="text-2-2">
<p>
Vérification du bon chargement des données en visualisant le début et
la fin des données.
</p>
<div class="org-src-container">
<pre class="src src-R">head(simpson_data)
tail(simpson_data)
</pre>
</div>
<pre class="example">
Smoker Status Age
1 Yes Alive 21.0
2 Yes Alive 19.3
3 No Dead 57.5
4 No Alive 47.1
5 Yes Alive 81.4
6 No Alive 36.8
Smoker Status Age
1309 No Alive 42.1
1310 Yes Alive 35.9
1311 No Alive 22.3
1312 Yes Dead 62.1
1313 No Dead 88.6
1314 No Alive 39.1
</pre>
<p>
Vérification de la présence de données manquante.
</p>
<div class="org-src-container">
<pre class="src src-R">na_records <span style="color: #008b8b;">&lt;-</span> apply(simpson_data, 1, <span style="color: #a020f0;">function</span>(x) any(is.na(x)))
simpson_data[na_records,]
</pre>
</div>
<pre class="example">
[1] Smoker Status Age
&lt;0 lignes&gt; (ou 'row.names' de longueur nulle)
</pre>
<p>
Vérification de la classe des données et du nombre de data.
</p>
<div class="org-src-container">
<pre class="src src-R">nrow(simpson_data)
class(simpson_data$Smoker)
class(simpson_data$Status)
class(simpson_data$Age)
</pre>
</div>
<pre class="example">
[1] 1314
[1] "factor"
[1] "factor"
[1] "numeric"
</pre>
</div>
</div>
</div>
<div id="outline-container-org244e357" class="outline-2">
<h2 id="org244e357"><span class="section-number-2">3</span> Analyse des données</h2>
<div class="outline-text-2" id="text-3">
</div>
<div id="outline-container-orgcb8b213" class="outline-3">
<h3 id="orgcb8b213"><span class="section-number-3">3.1</span> Question 1</h3>
<div class="outline-text-3" id="text-3-1">
<div class="org-src-container">
<pre class="src src-R">dead_or_alive <span style="color: #008b8b;">&lt;-</span> simpson_data<span style="color: #008b8b;">%&gt;%</span>
group_by(Smoker)<span style="color: #008b8b;">%&gt;%</span>
summarise(Number = n(), Alive = sum(Status == <span style="color: #8b2252;">"Alive"</span>), Dead = sum(Status == <span style="color: #8b2252;">"Dead"</span>), mortality_rate = (sum(Status == <span style="color: #8b2252;">"Dead"</span>)/n())*100)
xtable(dead_or_alive)
</pre>
</div>
<pre class="example">
% latex table generated in R 3.6.3 by xtable 1.8-4 package
% Fri Apr 3 16:36:26 2020
\begin{table}[ht]
\centering
\begin{tabular}{rlrrrr}
\hline
&amp; Smoker &amp; Number &amp; Alive &amp; Dead &amp; mortality\_rate \\
\hline
1 &amp; No &amp; 732 &amp; 502 &amp; 230 &amp; 31.42 \\
2 &amp; Yes &amp; 582 &amp; 443 &amp; 139 &amp; 23.88 \\
\hline
\end{tabular}
\end{table}
</pre>
</div>
</div>
</div>
</div>
<div id="postamble" class="status">
<p class="date">Date: \daily</p>
<p class="author">Auteur: Adam Taheraly</p>
<p class="date">Created: 2020-04-03 ven. 16:36</p>
<p class="validation"><a href="http://validator.w3.org/check?uri=referer">Validate</a></p>
</div>
</body>
</html>
......@@ -17,6 +17,7 @@ library(devtools) # Obtention des informations sur la session R
library(dplyr) # Manipulation des données
library(tidyverse) # Collection de packages pour la data science
library(knitr) # Génération de rapports dynamique
library(forcats)
library(broman) # Function intéressante pour R
library(RColorBrewer) # Couleur des figures
library(ggplot2) # Création des figures
......@@ -25,41 +26,6 @@ library(xtable) # Jolie tableau
library(kableExtra) # Création de tableau
#+end_src
#+RESULTS:
#+begin_example
Le chargement a nécessité le package : usethis
Attachement du package : ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
✔ ggplot2 3.3.0 ✔ purrr  0.3.3
✔ tibble  3.0.0 ✔ stringr 1.4.0
✔ tidyr  1.0.2 ✔ forcats 0.5.0
✔ readr  1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
Attachement du package : ‘broman’
The following object is masked from ‘package:purrr’:
done
Attachement du package : ‘kableExtra’
The following object is masked from ‘package:dplyr’:
group_rows
#+end_example
* Mise en place des données :noexport:
** Chargement
......@@ -67,8 +33,6 @@ The following object is masked from ‘package:dplyr’:
simpson_data <- read.csv("Subject6_smoking.csv")
#+end_src
#+RESULTS:
Les données sont sous la forme :
| Libellé | Description |
......@@ -86,35 +50,12 @@ head(simpson_data)
tail(simpson_data)
#+end_src
#+RESULTS:
#+begin_example
Smoker Status Age
1 Yes Alive 21.0
2 Yes Alive 19.3
3 No Dead 57.5
4 No Alive 47.1
5 Yes Alive 81.4
6 No Alive 36.8
Smoker Status Age
1309 No Alive 42.1
1310 Yes Alive 35.9
1311 No Alive 22.3
1312 Yes Dead 62.1
1313 No Dead 88.6
1314 No Alive 39.1
#+end_example
Vérification de la présence de données manquantes.
#+begin_src R :results output :session *R* :exports both
# Flag every row that contains at least one missing value.
# complete.cases() inspects the columns directly, so the data frame is not
# coerced to a matrix the way a row-wise apply() would do.
na_records <- !complete.cases(simpson_data)
# Show the incomplete rows; an empty data frame means nothing is missing.
simpson_data[na_records, ]
#+end_src
#+RESULTS:
:
: [1] Smoker Status Age
: <0 lignes> (ou 'row.names' de longueur nulle)
Vérification de la classe des données et du nombre d'observations.
#+begin_src R :results output :session *R* :exports both
......@@ -124,14 +65,6 @@ class(simpson_data$Status)
class(simpson_data$Age)
#+end_src
#+RESULTS:
: [1] 1314
:
: [1] "factor"
:
: [1] "factor"
:
: [1] "numeric"
* Analyse des données
** Question 1
......@@ -142,18 +75,10 @@ summarise(Number = n(), Alive = sum(Status == "Alive"), Dead = sum(Status == "De
dead_or_alive$mortality <- round(dead_or_alive$mortality, 2)
#+end_src
#+RESULTS:
#+begin_src R :results table :colnames yes :session *R* :exports results
# Convert the Question 1 summary (`dead_or_alive`) to an xtable object so the
# block's result is exported as a table.
xtable(dead_or_alive)
#+end_src
#+RESULTS:
| Smoker | Number | Alive | Dead | mortality |
|--------+--------+-------+------+-----------|
| No | 732 | 502 | 230 | 31.42 |
| Yes | 582 | 443 | 139 | 23.88 |
#+begin_src R :results output graphics :file (org-babel-temp-file "figure" ".png") :exports both :width 600 :height 400 :session *R*
mortality <- ggplot(dead_or_alive, aes(x = Smoker, y = mortality, fill = Smoker)) + geom_bar(stat = "identity") +
......@@ -161,13 +86,9 @@ mortality <- ggplot(dead_or_alive, aes(x = Smoker, y = mortality, fill = Smoker)
mortality
#+end_src
#+RESULTS:
[[file:/tmp/babel-pNwaHh/figureO5jUUE.png]]
Le résultat est surprenant car on observe un plus fort taux de
mortalité chez les non-fumeurs que chez les fumeurs.
** Question 2
On recrée un data-frame pour ne pas interagir avec les résultats
précédents.
......@@ -176,17 +97,13 @@ simpson_data_q2 <- simpson_data
simpson_data_q2$Age <- as.numeric(simpson_data_q2$Age)
#+end_src
#+RESULTS:
On regroupe les âges en catégories.
#+begin_src R :results output :session *R* :exports both
# Bin the ages into four classes. Intervals are closed on the right, and
# include.lowest = TRUE also puts an age of exactly 18 in the first class.
age_breaks <- c(18, 34, 54, 64, 200)
age_labels <- c("18-34", "35-54", "55-64", "plus de 65")
simpson_data_q2$Age <- cut(
  simpson_data_q2$Age,
  breaks = age_breaks,
  labels = age_labels,
  include.lowest = TRUE
)
# Count the observations that fall in each age class.
table(simpson_data_q2$Age)
#+end_src
#+RESULTS:
:
: 18-34 35-54 55-64 plus de 65
: 400 436 236 242
#+begin_src R :results output :session *R* :exports both
dead_or_alive2 <- simpson_data_q2%>%
......@@ -198,23 +115,9 @@ mutate(mortality = (Dead/sum(Number))*100)
# Round the mortality column of the age-stratified table `dead_or_alive2`
# (the table built for this question, not the Question 1 table).
dead_or_alive2$mortality <- round(dead_or_alive2$mortality, 2)
#+end_src
#+RESULTS:
#+begin_src R :results table :colnames yes :session *R* :exports results
# Convert the Question 2 summary (`dead_or_alive2`) to an xtable object so the
# block's result is exported as a table.
xtable(dead_or_alive2)
#+end_src
#+RESULTS:
| Smoker | Age | Number | Alive | Dead | mortality |
|--------+------------+--------+-------+------+-------------------|
| No | 18-34 | 219 | 213 | 6 | 0.819672131147541 |
| No | 35-54 | 199 | 180 | 19 | 2.59562841530055 |
| No | 55-64 | 121 | 81 | 40 | 5.46448087431694 |
| No | plus de 65 | 193 | 28 | 165 | 22.5409836065574 |
| Yes | 18-34 | 181 | 176 | 5 | 0.859106529209622 |
| Yes | 35-54 | 237 | 196 | 41 | 7.0446735395189 |
| Yes | 55-64 | 115 | 64 | 51 | 8.76288659793814 |
| Yes | plus de 65 | 49 | 7 | 42 | 7.21649484536082 |
#+begin_src R :results output graphics :file (org-babel-temp-file "figure" ".png") :exports both :width 600 :height 400 :session *R*
mortality2 <- ggplot(dead_or_alive2, aes(x = Smoker, y = mortality, fill = Age))+ geom_bar(stat = "identity", position = position_dodge()) +
......@@ -222,15 +125,65 @@ mortality2 <- ggplot(dead_or_alive2, aes(x = Smoker, y = mortality, fill = Age))
mortality2
#+end_src
#+RESULTS:
[[file:/tmp/babel-fRQzWg/figureR2aE0F.png]]
On observe une diminution du taux de mortalité chez les moins de 65
ans parmi les non-fumeurs par rapport aux fumeurs. Cependant, chez les
plus de 65 ans, la tendance s'inverse avec une plus grande mortalité
chez les non-fumeurs que chez les fumeurs (environ 3 fois plus).
Cela pourrait s'expliquer par le fait que le corps des fumeurs se soit
habitué aux toxines, alors que les non-fumeurs, subissant le tabagisme
passif et ayant atteint un âge où ils sont plus à risque, seraient plus
susceptibles de décéder à cause des toxines du tabac.
L'inversion de ces proportions peut s'expliquer par un enrichissement
en personnes âgées chez les non-fumeurs.
** Question 3
On recrée un data-frame pour ne pas interagir avec les résultats
précédents.
#+begin_src R :results output :session *R* :exports both
# Work on a copy so the objects used for the previous questions stay untouched.
simpson_data_q3 <- simpson_data
# Encode the outcome as a 0/1 indicator for the logistic regression:
# Death is 1 when Status is "Dead" and 0 when it is "Alive", so the column
# name matches what the value measures.
simpson_data_q3$Death <- as.numeric(simpson_data_q3$Status == "Dead")
# Make sure Age is numeric, whether it was read as a number or as a factor.
simpson_data_q3$Age <- as.numeric(as.character(simpson_data_q3$Age))
#+end_src
#+begin_src R :results output :session *R* :exports both
# Preview the first rows to check the added Death column.
head(simpson_data_q3)
#+end_src
#+begin_src R :results output :session *R* :exports both
# Build a scatter plot of the binary `Death` column against `Age`, overlaid
# with a logistic regression curve (binomial GLM) and its confidence band.
#
# data: a data frame holding numeric `Age` and 0/1 `Death` columns.
# Returns the ggplot object; nothing is drawn until it is printed.
plot_logistic_fit <- function(data) {
  ggplot(data, aes(x = Age, y = Death)) +
    geom_point(alpha = 0.3, size = 3) +
    theme_bw() +
    geom_smooth(
      method = "glm",
      method.args = list(family = "binomial"),
      fullrange = TRUE
    )
}

# Same plot for the whole sample, then for smokers and non-smokers separately.
reg_log_total <- plot_logistic_fit(simpson_data_q3)
reg_log_fumeur <- plot_logistic_fit(
  simpson_data_q3[simpson_data_q3$Smoker == "Yes", ]
)
reg_log_non_fumeur <- plot_logistic_fit(
  simpson_data_q3[simpson_data_q3$Smoker == "No", ]
)
#+end_src
#+begin_src R :results output graphics :file (org-babel-temp-file "figure" ".png") :exports both :width 600 :height 400 :session *R*
# Draw the logistic fit computed on the whole sample.
reg_log_total
#+end_src
#+begin_src R :results output graphics :file (org-babel-temp-file "figure" ".png") :exports both :width 600 :height 400 :session *R*
# Draw the logistic fit restricted to smokers (Smoker == "Yes").
reg_log_fumeur
#+end_src
#+begin_src R :results output graphics :file (org-babel-temp-file "figure" ".png") :exports both :width 600 :height 400 :session *R*
# Draw the logistic fit restricted to non-smokers (Smoker == "No").
reg_log_non_fumeur
#+end_src
Passé 60 ans, bien que les courbes soient semblables, on voit que l'intervalle de confiance chez les fumeurs
est plus grand que chez les non-fumeurs. Cela s'explique par une
plus faible population de personnes âgées chez les fumeurs.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment