R introduction

  1. Calculate the mean of every column of the iris data frame except “Species”, and combine the results into a list

  2. Calculate the mean of each row, excluding the “Species” column, and show the result as a vector

  3. Create a random DNA sequence 1000 nucleotides long, count the A and T nucleotides in it, and store the counts in a named vector ‘dna_at’

  4. Generate a random sequence of alphabet letters, 10000 characters long, and count the number of vowels.

  5. Sort all species in the iris table by their mean sepal length. The result should be a vector of the species names in the correct order

  6. Write a function that calculates the median of a vector

  7. Plot the dependence of sepal length on petal length for each species in the iris data frame (with species treated as a factor)

  8. Using data from the ‘diamonds’ data frame (available after loading the ggplot2 package), calculate the mean price per carat for each clarity category

Passing activities

Using the data available at the following link, build the best-fitting models for the chosen variables over the given period of time:

  • Arsalan - build a model for the variable co2_flux using data from the summer months
  • Olga - build a model for the variable h2o_flux using data from the autumn months

Example

# It is better to use the code generated by the Import Dataset wizard
library(readr)
## Warning: package 'readr' was built under R version 3.3.2
# First pass over the file: skip one line so that the next line is taken as
# the header. Only the resulting column names are reused afterwards.
eddypro <- read_csv(
  "C:/Users/iaros/YandexDisk/Jobs/EDDY/environmental-modelling.github.io/eddypro.csv",
  skip = 1
)
## Warning: Duplicated column names deduplicated: 'skweness_kurtosis' =>
## 'skweness_kurtosis_1' [93], 'discontinuities' => 'discontinuities_1' [95],
## 'timelag' => 'timelag_1' [97], 'co2' => 'co2_1' [127], 'h2o' =>
## 'h2o_1' [128]
## Parsed with column specification:
## cols(
##   .default = col_character()
## )
## See spec(...) for full column specifications.
# Second pass over the same file: reuse the header names from `eddypro` and
# start reading after the first three lines, so column types are guessed from
# the remaining rows only.
# NOTE(review): presumably the skipped lines hold the header plus a units
# row; confirm against the raw file.
table_names <- names(eddypro)
data <- read_csv(
  "C:/Users/iaros/YandexDisk/Jobs/EDDY/environmental-modelling.github.io/eddypro.csv",
  skip = 3,
  col_names = table_names
)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   filename = col_character(),
##   date = col_date(format = ""),
##   time = col_time(format = ""),
##   daytime = col_character(),
##   co2_def_timelag = col_character(),
##   h2o_def_timelag = col_character(),
##   model = col_character(),
##   spikes = col_character(),
##   amplitude_resolution = col_character(),
##   drop_out = col_character(),
##   absolute_limits = col_character(),
##   skweness_kurtosis = col_character(),
##   skweness_kurtosis_1 = col_character(),
##   discontinuities = col_character(),
##   discontinuities_1 = col_character(),
##   timelag = col_character(),
##   timelag_1 = col_character(),
##   attack_angle = col_character(),
##   non_steady_wind = col_character()
## )
## See spec(...) for full column specifications.
# Replace every cell equal to -9999 with NA.
data[data == -9999] <- NA
# Keep the rows with 150 < DOY < 241. which() discards rows whose DOY is NA;
# plain logical indexing would turn each of them into an all-NA phantom row.
# data.frame() also makes the column names syntactic (e.g. `x_70%` becomes
# `x_70.`), which the later code relies on.
# NOTE(review): these bounds are presumably meant to cover the summer months;
# confirm the exact day-of-year limits against the calendar of the data.
summer_data <- data.frame(data[which(data$DOY > 150 & data$DOY < 241), ])

#summer_data$`x_70%`
# Start from an empty accumulator and print the column names together with
# their positions; the positions are used below to drop columns by index.
cor_vector <- NULL
colnames(summer_data)
##   [1] "filename"                 "date"                    
##   [3] "time"                     "DOY"                     
##   [5] "daytime"                  "file_records"            
##   [7] "used_records"             "Tau"                     
##   [9] "qc_Tau"                   "rand_err_Tau"            
##  [11] "H"                        "qc_H"                    
##  [13] "rand_err_H"               "LE"                      
##  [15] "qc_LE"                    "rand_err_LE"             
##  [17] "co2_flux"                 "qc_co2_flux"             
##  [19] "rand_err_co2_flux"        "h2o_flux"                
##  [21] "qc_h2o_flux"              "rand_err_h2o_flux"       
##  [23] "H_strg"                   "LE_strg"                 
##  [25] "co2_strg"                 "h2o_strg"                
##  [27] "co2_v.adv"                "h2o_v.adv"               
##  [29] "co2_molar_density"        "co2_mole_fraction"       
##  [31] "co2_mixing_ratio"         "co2_time_lag"            
##  [33] "co2_def_timelag"          "h2o_molar_density"       
##  [35] "h2o_mole_fraction"        "h2o_mixing_ratio"        
##  [37] "h2o_time_lag"             "h2o_def_timelag"         
##  [39] "sonic_temperature"        "air_temperature"         
##  [41] "air_pressure"             "air_density"             
##  [43] "air_heat_capacity"        "air_molar_volume"        
##  [45] "water_vapor_density"      "e"                       
##  [47] "es"                       "specific_humidity"       
##  [49] "RH"                       "VPD"                     
##  [51] "Tdew"                     "u_unrot"                 
##  [53] "v_unrot"                  "w_unrot"                 
##  [55] "u_rot"                    "v_rot"                   
##  [57] "w_rot"                    "wind_speed"              
##  [59] "max_speed"                "wind_dir"                
##  [61] "yaw"                      "pitch"                   
##  [63] "roll"                     "u."                      
##  [65] "TKE"                      "L"                       
##  [67] "X.z.d..L"                 "bowen_ratio"             
##  [69] "T."                       "model"                   
##  [71] "x_peak"                   "x_offset"                
##  [73] "x_10."                    "x_30."                   
##  [75] "x_50."                    "x_70."                   
##  [77] "x_90."                    "un_Tau"                  
##  [79] "Tau_scf"                  "un_H"                    
##  [81] "H_scf"                    "un_LE"                   
##  [83] "LE_scf"                   "un_co2_flux"             
##  [85] "co2_scf"                  "un_h2o_flux"             
##  [87] "h2o_scf"                  "spikes"                  
##  [89] "amplitude_resolution"     "drop_out"                
##  [91] "absolute_limits"          "skweness_kurtosis"       
##  [93] "skweness_kurtosis_1"      "discontinuities"         
##  [95] "discontinuities_1"        "timelag"                 
##  [97] "timelag_1"                "attack_angle"            
##  [99] "non_steady_wind"          "u_spikes"                
## [101] "v_spikes"                 "w_spikes"                
## [103] "ts_spikes"                "co2_spikes"              
## [105] "h2o_spikes"               "head_detect"             
## [107] "t_out"                    "t_in"                    
## [109] "aux_in"                   "delta_p"                 
## [111] "chopper"                  "detector"                
## [113] "pll"                      "sync"                    
## [115] "mean_value"               "u_var"                   
## [117] "v_var"                    "w_var"                   
## [119] "ts_var"                   "co2_var"                 
## [121] "h2o_var"                  "w.ts_cov"                
## [123] "w.co2_cov"                "w.h2o_cov"               
## [125] "co2"                      "h2o"                     
## [127] "co2_1"                    "h2o_1"                   
## [129] "co2_signal_strength_7200" "h2o_signal_strength_7200"
## [131] "flowrate"
# Drop, by position, columns 1-3, 5, 33, 38, 70 and 88-131 of the
# names(summer_data) listing (the character/date/time columns and everything
# from "spikes" onwards), leaving only columns cor() can work with.
# NOTE(review): positional removal is not safe to re-run on the already
# reduced data frame.
summer_data <- summer_data[, -c(1:3, 5, 33, 38, 70, 88:131)]

# Correlation of every remaining column with x_70.; the reference column is
# extracted once instead of on every iteration. vapply() replaces the
# index loop that grew the result with c(), and guarantees one numeric value
# per column. Columns with zero standard deviation yield NA with a warning.
y <- summer_data$x_70.
cor_vector <- vapply(
  summer_data,
  function(x) cor(x, y, use = "na.or.complete"),
  numeric(1),
  USE.NAMES = FALSE
)
## Warning in cor(x, y, use = "na.or.complete"): стандартное отклонение
## нулевое
## Warning in cor(x, y, use = "na.or.complete"): стандартное отклонение
## нулевое

## Warning in cor(x, y, use = "na.or.complete"): стандартное отклонение
## нулевое

## Warning in cor(x, y, use = "na.or.complete"): стандартное отклонение
## нулевое
# Label each correlation with its column name, then show those whose square
# exceeds 0.1. Correlations that are NA make the comparison NA as well, so
# they appear in the result as <NA> entries.
cor_vector <- setNames(cor_vector, names(summer_data))
cor_vector[cor_vector * cor_vector > 0.1]
##       <NA>       <NA>       <NA>       <NA>       <NA>         u. 
##         NA         NA         NA         NA         NA -0.3473370 
##     x_peak   x_offset      x_10.      x_30.      x_50.      x_70. 
##  0.9250022  0.9381630  0.9709691  0.9895082  0.9965494  1.0000000 
##      x_90.    Tau_scf      H_scf     LE_scf    co2_scf    h2o_scf 
##  0.7003597  0.3659544  0.4575340  0.3903480  0.3903480  0.3903480
# Candidate predictors: H_scf, LE_scf, co2_scf, h2o_scf, x_peak

# Full model: the five predictors with all of their interactions up to the
# fifth order.
model1 <- lm(
  x_70. ~ (H_scf + LE_scf + co2_scf + h2o_scf + x_peak)^5,
  data = summer_data
)
summary(model1)
## 
## Call:
## lm(formula = x_70. ~ (H_scf + LE_scf + co2_scf + h2o_scf + x_peak)^5, 
##     data = summer_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1264.9   -74.3   -35.8    77.6  6638.1 
## 
## Coefficients: (16 not defined because of singularities)
##                                       Estimate Std. Error t value Pr(>|t|)
## (Intercept)                         -2.950e+04  7.460e+03  -3.954 7.83e-05
## H_scf                                2.907e+04  8.526e+03   3.409 0.000658
## LE_scf                               3.057e+04  5.712e+03   5.353 9.23e-08
## co2_scf                                     NA         NA      NA       NA
## h2o_scf                                     NA         NA      NA       NA
## x_peak                              -3.460e+01  4.100e+01  -0.844 0.398777
## H_scf:LE_scf                        -3.183e+04  7.805e+03  -4.078 4.65e-05
## H_scf:co2_scf                               NA         NA      NA       NA
## H_scf:h2o_scf                               NA         NA      NA       NA
## H_scf:x_peak                         1.239e+01  4.679e+01   0.265 0.791235
## LE_scf:co2_scf                      -5.517e+03  8.716e+02  -6.330 2.76e-10
## LE_scf:h2o_scf                              NA         NA      NA       NA
## LE_scf:x_peak                        1.699e+02  2.034e+01   8.351  < 2e-16
## co2_scf:h2o_scf                             NA         NA      NA       NA
## co2_scf:x_peak                              NA         NA      NA       NA
## h2o_scf:x_peak                              NA         NA      NA       NA
## H_scf:LE_scf:co2_scf                 7.513e+03  7.481e+02  10.043  < 2e-16
## H_scf:LE_scf:h2o_scf                        NA         NA      NA       NA
## H_scf:LE_scf:x_peak                 -1.486e+02  2.885e+01  -5.149 2.76e-07
## H_scf:co2_scf:h2o_scf                       NA         NA      NA       NA
## H_scf:co2_scf:x_peak                        NA         NA      NA       NA
## H_scf:h2o_scf:x_peak                        NA         NA      NA       NA
## LE_scf:co2_scf:h2o_scf              -5.705e+02  1.328e+02  -4.296 1.78e-05
## LE_scf:co2_scf:x_peak               -2.289e+01  3.374e+00  -6.783 1.38e-11
## LE_scf:h2o_scf:x_peak                       NA         NA      NA       NA
## co2_scf:h2o_scf:x_peak                      NA         NA      NA       NA
## H_scf:LE_scf:co2_scf:h2o_scf        -7.849e+00  2.380e+01  -0.330 0.741546
## H_scf:LE_scf:co2_scf:x_peak          3.062e+01  4.537e+00   6.750 1.72e-11
## H_scf:LE_scf:h2o_scf:x_peak                 NA         NA      NA       NA
## H_scf:co2_scf:h2o_scf:x_peak                NA         NA      NA       NA
## LE_scf:co2_scf:h2o_scf:x_peak       -1.828e+00  7.677e-01  -2.381 0.017327
## H_scf:LE_scf:co2_scf:h2o_scf:x_peak -3.183e-01  1.416e-01  -2.247 0.024693
##                                        
## (Intercept)                         ***
## H_scf                               ***
## LE_scf                              ***
## co2_scf                                
## h2o_scf                                
## x_peak                                 
## H_scf:LE_scf                        ***
## H_scf:co2_scf                          
## H_scf:h2o_scf                          
## H_scf:x_peak                           
## LE_scf:co2_scf                      ***
## LE_scf:h2o_scf                         
## LE_scf:x_peak                       ***
## co2_scf:h2o_scf                        
## co2_scf:x_peak                         
## h2o_scf:x_peak                         
## H_scf:LE_scf:co2_scf                ***
## H_scf:LE_scf:h2o_scf                   
## H_scf:LE_scf:x_peak                 ***
## H_scf:co2_scf:h2o_scf                  
## H_scf:co2_scf:x_peak                   
## H_scf:h2o_scf:x_peak                   
## LE_scf:co2_scf:h2o_scf              ***
## LE_scf:co2_scf:x_peak               ***
## LE_scf:h2o_scf:x_peak                  
## co2_scf:h2o_scf:x_peak                 
## H_scf:LE_scf:co2_scf:h2o_scf           
## H_scf:LE_scf:co2_scf:x_peak         ***
## H_scf:LE_scf:h2o_scf:x_peak            
## H_scf:co2_scf:h2o_scf:x_peak           
## LE_scf:co2_scf:h2o_scf:x_peak       *  
## H_scf:LE_scf:co2_scf:h2o_scf:x_peak *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 209.2 on 3479 degrees of freedom
##   (872 observations deleted due to missingness)
## Multiple R-squared:  0.9253, Adjusted R-squared:  0.925 
## F-statistic:  2873 on 15 and 3479 DF,  p-value: < 2.2e-16
# Reduced model: a hand-picked subset of the terms of model1.
model2 <- lm(
  x_70. ~ H_scf + H_scf:LE_scf + LE_scf:co2_scf + LE_scf:x_peak +
    LE_scf:co2_scf:x_peak + H_scf:LE_scf:co2_scf:h2o_scf,
  data = summer_data
)
summary(model2)
## 
## Call:
## lm(formula = x_70. ~ H_scf + H_scf:LE_scf + LE_scf:co2_scf + 
##     LE_scf:x_peak + LE_scf:co2_scf:x_peak + H_scf:LE_scf:co2_scf:h2o_scf, 
##     data = summer_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4068.7  -120.4   -91.1    80.6  6682.4 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -1.494e+03  9.740e+02  -1.534  0.12516    
## H_scf                         1.265e+03  1.104e+03   1.146  0.25186    
## H_scf:LE_scf                  1.926e+02  1.809e+02   1.065  0.28704    
## LE_scf:co2_scf               -5.049e+01  3.105e+01  -1.626  0.10408    
## LE_scf:x_peak                 5.959e+00  8.235e-02  72.365  < 2e-16 ***
## LE_scf:co2_scf:x_peak        -7.138e-01  1.439e-02 -49.590  < 2e-16 ***
## H_scf:LE_scf:co2_scf:h2o_scf  8.727e-01  3.020e-01   2.890  0.00388 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 332 on 3488 degrees of freedom
##   (872 observations deleted due to missingness)
## Multiple R-squared:  0.8115, Adjusted R-squared:  0.8112 
## F-statistic:  2503 on 6 and 3488 DF,  p-value: < 2.2e-16
# Single-predictor fit on H_scf.
model3 <- lm(formula = x_70. ~ H_scf, data = summer_data)
# Analysis-of-variance comparison of the reduced model2 with the full model1.
anova(model2, model1)
## Analysis of Variance Table
## 
## Model 1: x_70. ~ H_scf + H_scf:LE_scf + LE_scf:co2_scf + LE_scf:x_peak + 
##     LE_scf:co2_scf:x_peak + H_scf:LE_scf:co2_scf:h2o_scf
## Model 2: x_70. ~ (H_scf + LE_scf + co2_scf + h2o_scf + x_peak)^5
##   Res.Df       RSS Df Sum of Sq      F    Pr(>F)    
## 1   3488 384360898                                  
## 2   3479 152323466  9 232037432 588.85 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Pairwise correlation between two of the predictors used in the models.
with(summer_data, cor(H_scf, LE_scf, use = "na.or.complete"))
## [1] 0.9823915
# Multicollinearity test
# For example, if you are left with columns a, b, c, d, e, f,
# you need to build
# data2 = data.frame(a = data$a,b = data$b,c = data$c,d = data$d,e = data$e,f = data$f)
# cor(data2, use = "na.or.complete")