From dc53af0f02826bf9640f96553c8eaa295fa060d1 Mon Sep 17 00:00:00 2001 From: David Elser Date: Thu, 4 Mar 2021 14:18:37 +0100 Subject: [PATCH] Filter motfis --- module2/exo4/exercice_en.Rmd | 48 +-- module2/exo4/exercice_en.html | 544 ++++++++++++++++++++++++++++++++++ 2 files changed, 574 insertions(+), 18 deletions(-) create mode 100644 module2/exo4/exercice_en.html diff --git a/module2/exo4/exercice_en.Rmd b/module2/exo4/exercice_en.Rmd index 13b258d..e7d08bf 100644 --- a/module2/exo4/exercice_en.Rmd +++ b/module2/exo4/exercice_en.Rmd @@ -1,33 +1,45 @@ --- -title: "Your title" -author: "Your name" -date: "Today's date" +title: "Filter Motifs from MS2LDA" +author: "David Elser" +date: "04.03.2021" output: html_document --- +#Read a Simple csv File - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) +```{r} +mydata = read.csv("MS2LDA.csv") ``` -## Some explanations +```{r} +head(mydata) +str(mydata) +``` -This is an R Markdown document that you can easily export to HTML, PDF, and MS Word formats. For more information on R Markdown, see . +# Count Motifs and print top occurences +```{r} +library(dplyr) +motifnb=mydata %>% + group_by(Motif) %>% + tally() +top=motifnb[with(motifnb,order(-n)),] -When you click on the button **Knit**, the document will be compiled in order to re-execute the R code and to include the results into the final document. As we have shown in the video, R code is inserted as follows: +head(top) +``` +# Create List with specific motifs -```{r cars} -summary(cars) +```{r} +removed=mydata[(mydata$Motif == "motif_534" | mydata$Motif == "motif_667"), ] +head(removed) +str(removed) ``` -It is also straightforward to include figures. For example: +# Removed motifs according filterd list -```{r pressure, echo=FALSE} -plot(pressure) -``` +```{r} -Note the parameter `echo = FALSE` that indicates that the code will not appear in the final version of the document. 
We recommend not to use this parameter in the context of this MOOC, because we want your data analyses to be perfectly transparent and reproducible. +shortlist=setdiff(mydata, removed) +head(shortlist) +str(shortlist) +``` -Since the results are not stored in Rmd files, you should generate an HTML or PDF version of your exercises and commit them. Otherwise reading and checking your analysis will be difficult for anyone else but you. -Now it's your turn! You can delete all this information and replace it by your computational document. diff --git a/module2/exo4/exercice_en.html b/module2/exo4/exercice_en.html new file mode 100644 index 0000000..965b528 --- /dev/null +++ b/module2/exo4/exercice_en.html @@ -0,0 +1,544 @@ + + + + + + + + + + + + + + +Your title + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +

Read a Simple CSV File

+
mydata = read.csv("MS2LDA.csv")
+
head(mydata)
+
##   row.ID Document                    Motif Probability Overlap.Score
+## 1     14       14 urine_mass2motif_260.m2m       0.391         0.399
+## 2     14       14        gnps_motif_40.m2m       0.313         0.947
+## 3     14       14  urine_mass2motif_90.m2m       0.268         0.350
+## 4     20       20                motif_536       0.890         0.543
+## 5     20       20        gnps_motif_49.m2m       0.101         1.000
+## 6     36       36 urine_mass2motif_260.m2m       0.371         0.400
+##   Precursor.Mass Retention.Time Document.Annotation
+## 1       149.0232           None                None
+## 2       149.0232           None                None
+## 3       149.0232           None                None
+## 4       607.2922           None                None
+## 5       607.2922           None                None
+## 6       149.0232           None                None
+
str(mydata)
+
## 'data.frame':    9468 obs. of  8 variables:
+##  $ row.ID             : int  14 14 14 20 20 36 36 36 37 37 ...
+##  $ Document           : int  14 14 14 20 20 36 36 36 37 37 ...
+##  $ Motif              : Factor w/ 370 levels "StrepSalini_motif_110.m2m",..: 362 28 370 190 31 362 28 370 362 364 ...
+##  $ Probability        : num  0.391 0.313 0.268 0.89 0.101 0.371 0.306 0.285 0.465 0.463 ...
+##  $ Overlap.Score      : num  0.399 0.947 0.35 0.543 1 0.4 0.947 0.363 0.421 0.426 ...
+##  $ Precursor.Mass     : num  149 149 149 607 607 ...
+##  $ Retention.Time     : Factor w/ 1 level "None": 1 1 1 1 1 1 1 1 1 1 ...
+##  $ Document.Annotation: Factor w/ 1 level "None": 1 1 1 1 1 1 1 1 1 1 ...
+
+

Count Motifs and print top occurrences

+
library(dplyr)
+
## 
+## Attaching package: 'dplyr'
+
## The following objects are masked from 'package:stats':
+## 
+##     filter, lag
+
## The following objects are masked from 'package:base':
+## 
+##     intersect, setdiff, setequal, union
+
motifnb=mydata %>%
+        group_by(Motif) %>%
+        tally()
+top=motifnb[with(motifnb,order(-n)),]
+
+head(top)
+
## # A tibble: 6 x 2
+##   Motif         n
+##   <fct>     <int>
+## 1 motif_534   814
+## 2 motif_667   612
+## 3 motif_557   608
+## 4 motif_579   440
+## 5 motif_413   310
+## 6 motif_519   250
+
+
+

Create List with specific motifs

+
removed=mydata[(mydata$Motif == "motif_534" | mydata$Motif == "motif_667"), ]
+head(removed)
+
##     row.ID Document     Motif Probability Overlap.Score Precursor.Mass
+## 65     105      105 motif_534       0.576         0.972       482.4054
+## 106    142      142 motif_667       0.419         0.987       496.4210
+## 111    145      145 motif_534       0.547         0.891       570.4579
+## 125    156      156 motif_534       0.324         0.994       496.4210
+## 139    164      164 motif_534       0.551         0.894       570.4579
+## 146    168      168 motif_534       0.879         0.969       540.4473
+##     Retention.Time Document.Annotation
+## 65            None                None
+## 106           None                None
+## 111           None                None
+## 125           None                None
+## 139           None                None
+## 146           None                None
+
str(removed)
+
## 'data.frame':    1426 obs. of  8 variables:
+##  $ row.ID             : int  105 142 145 156 164 168 180 189 205 208 ...
+##  $ Document           : int  105 142 145 156 164 168 180 189 205 208 ...
+##  $ Motif              : Factor w/ 370 levels "StrepSalini_motif_110.m2m",..: 188 319 188 188 188 188 188 188 319 319 ...
+##  $ Probability        : num  0.576 0.419 0.547 0.324 0.551 0.879 0.656 0.643 0.416 0.383 ...
+##  $ Overlap.Score      : num  0.972 0.987 0.891 0.994 0.894 0.969 0.957 0.988 0.993 0.989 ...
+##  $ Precursor.Mass     : num  482 496 570 496 570 ...
+##  $ Retention.Time     : Factor w/ 1 level "None": 1 1 1 1 1 1 1 1 1 1 ...
+##  $ Document.Annotation: Factor w/ 1 level "None": 1 1 1 1 1 1 1 1 1 1 ...
+
+
+

Remove motifs according to the filtered list

+
shortlist=setdiff(mydata, removed)
+head(shortlist)
+
##   row.ID Document                    Motif Probability Overlap.Score
+## 1     14       14 urine_mass2motif_260.m2m       0.391         0.399
+## 2     14       14        gnps_motif_40.m2m       0.313         0.947
+## 3     14       14  urine_mass2motif_90.m2m       0.268         0.350
+## 4     20       20                motif_536       0.890         0.543
+## 5     20       20        gnps_motif_49.m2m       0.101         1.000
+## 6     36       36 urine_mass2motif_260.m2m       0.371         0.400
+##   Precursor.Mass Retention.Time Document.Annotation
+## 1       149.0232           None                None
+## 2       149.0232           None                None
+## 3       149.0232           None                None
+## 4       607.2922           None                None
+## 5       607.2922           None                None
+## 6       149.0232           None                None
+
str(shortlist)
+
## 'data.frame':    8042 obs. of  8 variables:
+##  $ row.ID             : int  14 14 14 20 20 36 36 36 37 37 ...
+##  $ Document           : int  14 14 14 20 20 36 36 36 37 37 ...
+##  $ Motif              : Factor w/ 370 levels "StrepSalini_motif_110.m2m",..: 362 28 370 190 31 362 28 370 362 364 ...
+##  $ Probability        : num  0.391 0.313 0.268 0.89 0.101 0.371 0.306 0.285 0.465 0.463 ...
+##  $ Overlap.Score      : num  0.399 0.947 0.35 0.543 1 0.4 0.947 0.363 0.421 0.426 ...
+##  $ Precursor.Mass     : num  149 149 149 607 607 ...
+##  $ Retention.Time     : Factor w/ 1 level "None": 1 1 1 1 1 1 1 1 1 1 ...
+##  $ Document.Annotation: Factor w/ 1 level "None": 1 1 1 1 1 1 1 1 1 1 ...
+
+

Some explanations

+

This is an R Markdown document that you can easily export to HTML, PDF, and MS Word formats. For more information on R Markdown, see http://rmarkdown.rstudio.com.

+

When you click on the button Knit, the document will be compiled in order to re-execute the R code and to include the results into the final document. As we have shown in the video, R code is inserted as follows:

+
summary(cars)
+
##      speed           dist       
+##  Min.   : 4.0   Min.   :  2.00  
+##  1st Qu.:12.0   1st Qu.: 26.00  
+##  Median :15.0   Median : 36.00  
+##  Mean   :15.4   Mean   : 42.98  
+##  3rd Qu.:19.0   3rd Qu.: 56.00  
+##  Max.   :25.0   Max.   :120.00
+

It is also straightforward to include figures. For example:

+

+

Note the parameter echo = FALSE that indicates that the code will not appear in the final version of the document. We recommend not to use this parameter in the context of this MOOC, because we want your data analyses to be perfectly transparent and reproducible.

+

Since the results are not stored in Rmd files, you should generate an HTML or PDF version of your exercises and commit them. Otherwise reading and checking your analysis will be difficult for anyone else but you.

+

Now it’s your turn! You can delete all this information and replace it by your computational document.

+
+
+ + + + +
+ + + + + + + + -- 2.18.1