Calculate the mean of every column of the iris data frame except “Species”, and combine the results into a list
Calculate the mean of each row, excluding the “Species” column, and return the result as a vector
Create a random DNA sequence 1000 nucleotides long, count the number of A and T nucleotides in it, and store the counts in a named vector ‘dna_at’
Generate a random sequence of alphabet letters 10000 characters long and count the number of vowels in it.
Sort all species in the iris table by their mean sepal length. The result should be a vector with the species names in the correct order
Write a function that calculates the median of a vector
Plot the dependence of sepal length on petal length for each species in the iris data frame (treating species as a factor)
Using the ‘diamonds’ data frame (available after loading the ggplot2 package), calculate the mean price per carat for each clarity category
Using the data obtained from the following link, build the best-fitting models for the chosen variables over the given period of time:
# It is better to use the code generated by the Import Dataset wizard
library(readr)
## Warning: package 'readr' was built under R version 3.3.2
# Location of the EddyPro output file. It is read twice below, so the path is
# kept in one place.
eddypro_path <- "C:/Users/iaros/YandexDisk/Jobs/EDDY/environmental-modelling.github.io/eddypro.csv"
# First pass: skip the first line so that the second line becomes the header.
# Only the column names from this pass are used; as the message below shows,
# every column is parsed as character here.
eddypro <- read_csv(eddypro_path, skip = 1)
## Warning: Duplicated column names deduplicated: 'skweness_kurtosis' =>
## 'skweness_kurtosis_1' [93], 'discontinuities' => 'discontinuities_1' [95],
## 'timelag' => 'timelag_1' [97], 'co2' => 'co2_1' [127], 'h2o' =>
## 'h2o_1' [128]
## Parsed with column specification:
## cols(
## .default = col_character()
## )
## See spec(...) for full column specifications.
table_names <- names(eddypro)
# Second pass: skip the first three lines and supply the header captured above,
# so that column types are guessed from the data rows alone.
# NOTE(review): the third line is presumably a units row - confirm against the file.
data <- read_csv(eddypro_path, skip = 3, col_names = table_names)
## Parsed with column specification:
## cols(
## .default = col_double(),
## filename = col_character(),
## date = col_date(format = ""),
## time = col_time(format = ""),
## daytime = col_character(),
## co2_def_timelag = col_character(),
## h2o_def_timelag = col_character(),
## model = col_character(),
## spikes = col_character(),
## amplitude_resolution = col_character(),
## drop_out = col_character(),
## absolute_limits = col_character(),
## skweness_kurtosis = col_character(),
## skweness_kurtosis_1 = col_character(),
## discontinuities = col_character(),
## discontinuities_1 = col_character(),
## timelag = col_character(),
## timelag_1 = col_character(),
## attack_angle = col_character(),
## non_steady_wind = col_character()
## )
## See spec(...) for full column specifications.
# Treat -9999 as a missing value: every cell equal to it becomes NA.
data[data == -9999] <- NA
# Keep the rows with 150 < DOY < 241. which() discards rows where the
# comparison is NA instead of returning them as all-NA rows.
summer_rows <- which(data$DOY > 150 & data$DOY < 241)
# data.frame() (check.names = TRUE by default) also makes the column names
# syntactic, e.g. `x_70%` becomes x_70., which the code below relies on.
summer_data <- data.frame(data[summer_rows, ])
names(summer_data)
## [1] "filename" "date"
## [3] "time" "DOY"
## [5] "daytime" "file_records"
## [7] "used_records" "Tau"
## [9] "qc_Tau" "rand_err_Tau"
## [11] "H" "qc_H"
## [13] "rand_err_H" "LE"
## [15] "qc_LE" "rand_err_LE"
## [17] "co2_flux" "qc_co2_flux"
## [19] "rand_err_co2_flux" "h2o_flux"
## [21] "qc_h2o_flux" "rand_err_h2o_flux"
## [23] "H_strg" "LE_strg"
## [25] "co2_strg" "h2o_strg"
## [27] "co2_v.adv" "h2o_v.adv"
## [29] "co2_molar_density" "co2_mole_fraction"
## [31] "co2_mixing_ratio" "co2_time_lag"
## [33] "co2_def_timelag" "h2o_molar_density"
## [35] "h2o_mole_fraction" "h2o_mixing_ratio"
## [37] "h2o_time_lag" "h2o_def_timelag"
## [39] "sonic_temperature" "air_temperature"
## [41] "air_pressure" "air_density"
## [43] "air_heat_capacity" "air_molar_volume"
## [45] "water_vapor_density" "e"
## [47] "es" "specific_humidity"
## [49] "RH" "VPD"
## [51] "Tdew" "u_unrot"
## [53] "v_unrot" "w_unrot"
## [55] "u_rot" "v_rot"
## [57] "w_rot" "wind_speed"
## [59] "max_speed" "wind_dir"
## [61] "yaw" "pitch"
## [63] "roll" "u."
## [65] "TKE" "L"
## [67] "X.z.d..L" "bowen_ratio"
## [69] "T." "model"
## [71] "x_peak" "x_offset"
## [73] "x_10." "x_30."
## [75] "x_50." "x_70."
## [77] "x_90." "un_Tau"
## [79] "Tau_scf" "un_H"
## [81] "H_scf" "un_LE"
## [83] "LE_scf" "un_co2_flux"
## [85] "co2_scf" "un_h2o_flux"
## [87] "h2o_scf" "spikes"
## [89] "amplitude_resolution" "drop_out"
## [91] "absolute_limits" "skweness_kurtosis"
## [93] "skweness_kurtosis_1" "discontinuities"
## [95] "discontinuities_1" "timelag"
## [97] "timelag_1" "attack_angle"
## [99] "non_steady_wind" "u_spikes"
## [101] "v_spikes" "w_spikes"
## [103] "ts_spikes" "co2_spikes"
## [105] "h2o_spikes" "head_detect"
## [107] "t_out" "t_in"
## [109] "aux_in" "delta_p"
## [111] "chopper" "detector"
## [113] "pll" "sync"
## [115] "mean_value" "u_var"
## [117] "v_var" "w_var"
## [119] "ts_var" "co2_var"
## [121] "h2o_var" "w.ts_cov"
## [123] "w.co2_cov" "w.h2o_cov"
## [125] "co2" "h2o"
## [127] "co2_1" "h2o_1"
## [129] "co2_signal_strength_7200" "h2o_signal_strength_7200"
## [131] "flowrate"
# Drop columns by their position in the names(summer_data) listing:
# filename, date, time, daytime, co2_def_timelag, h2o_def_timelag and model
# (columns 1-3, 5, 33, 38, 70), plus everything from spikes to flowrate
# (columns 88-131).
summer_data <- summer_data[, -c(1:3, 5, 33, 38, 70, 88:131)]
# Correlation of every remaining column with x_70.; "na.or.complete" uses
# complete pairs only and yields NA when there are none.
# vapply() iterates over the columns directly, so the result is not grown
# element by element and an empty data frame is handled correctly.
y <- summer_data$x_70.
cor_vector <- vapply(
  summer_data,
  function(x) cor(x, y, use = "na.or.complete"),
  numeric(1)
)
## Warning in cor(x, y, use = "na.or.complete"): стандартное отклонение
## нулевое
## Warning in cor(x, y, use = "na.or.complete"): стандартное отклонение
## нулевое
## Warning in cor(x, y, use = "na.or.complete"): стандартное отклонение
## нулевое
## Warning in cor(x, y, use = "na.or.complete"): стандартное отклонение
## нулевое
# Label each correlation with the name of the column it was computed from.
names(cor_vector) <- names(summer_data)
# Keep the columns whose squared correlation with x_70. exceeds 0.1.
# Undefined correlations (NA, e.g. zero-variance columns) are excluded
# explicitly; indexing with an NA logical would otherwise print <NA> entries.
is_relevant <- !is.na(cor_vector) & cor_vector^2 > .1
cor_vector[is_relevant]
## u. x_peak x_offset x_10. x_30. x_50.
## -0.3473370 0.9250022 0.9381630 0.9709691 0.9895082 0.9965494
## x_70. x_90. Tau_scf H_scf LE_scf co2_scf
## 1.0000000 0.7003597 0.3659544 0.4575340 0.3903480 0.3903480
## h2o_scf
## 0.3903480
# Candidate predictors: H_scf, LE_scf, co2_scf, h2o_scf, x_peak.
# (...)^5 expands to all main effects plus every interaction among the five
# predictors, up to the five-way term.
model1 <- lm(
  x_70. ~ (H_scf + LE_scf + co2_scf + h2o_scf + x_peak)^5,
  data = summer_data
)
summary(model1)
##
## Call:
## lm(formula = x_70. ~ (H_scf + LE_scf + co2_scf + h2o_scf + x_peak)^5,
## data = summer_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1264.9 -74.3 -35.8 77.6 6638.1
##
## Coefficients: (16 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.950e+04 7.460e+03 -3.954 7.83e-05
## H_scf 2.907e+04 8.526e+03 3.409 0.000658
## LE_scf 3.057e+04 5.712e+03 5.353 9.23e-08
## co2_scf NA NA NA NA
## h2o_scf NA NA NA NA
## x_peak -3.460e+01 4.100e+01 -0.844 0.398777
## H_scf:LE_scf -3.183e+04 7.805e+03 -4.078 4.65e-05
## H_scf:co2_scf NA NA NA NA
## H_scf:h2o_scf NA NA NA NA
## H_scf:x_peak 1.239e+01 4.679e+01 0.265 0.791235
## LE_scf:co2_scf -5.517e+03 8.716e+02 -6.330 2.76e-10
## LE_scf:h2o_scf NA NA NA NA
## LE_scf:x_peak 1.699e+02 2.034e+01 8.351 < 2e-16
## co2_scf:h2o_scf NA NA NA NA
## co2_scf:x_peak NA NA NA NA
## h2o_scf:x_peak NA NA NA NA
## H_scf:LE_scf:co2_scf 7.513e+03 7.481e+02 10.043 < 2e-16
## H_scf:LE_scf:h2o_scf NA NA NA NA
## H_scf:LE_scf:x_peak -1.486e+02 2.885e+01 -5.149 2.76e-07
## H_scf:co2_scf:h2o_scf NA NA NA NA
## H_scf:co2_scf:x_peak NA NA NA NA
## H_scf:h2o_scf:x_peak NA NA NA NA
## LE_scf:co2_scf:h2o_scf -5.705e+02 1.328e+02 -4.296 1.78e-05
## LE_scf:co2_scf:x_peak -2.289e+01 3.374e+00 -6.783 1.38e-11
## LE_scf:h2o_scf:x_peak NA NA NA NA
## co2_scf:h2o_scf:x_peak NA NA NA NA
## H_scf:LE_scf:co2_scf:h2o_scf -7.849e+00 2.380e+01 -0.330 0.741546
## H_scf:LE_scf:co2_scf:x_peak 3.062e+01 4.537e+00 6.750 1.72e-11
## H_scf:LE_scf:h2o_scf:x_peak NA NA NA NA
## H_scf:co2_scf:h2o_scf:x_peak NA NA NA NA
## LE_scf:co2_scf:h2o_scf:x_peak -1.828e+00 7.677e-01 -2.381 0.017327
## H_scf:LE_scf:co2_scf:h2o_scf:x_peak -3.183e-01 1.416e-01 -2.247 0.024693
##
## (Intercept) ***
## H_scf ***
## LE_scf ***
## co2_scf
## h2o_scf
## x_peak
## H_scf:LE_scf ***
## H_scf:co2_scf
## H_scf:h2o_scf
## H_scf:x_peak
## LE_scf:co2_scf ***
## LE_scf:h2o_scf
## LE_scf:x_peak ***
## co2_scf:h2o_scf
## co2_scf:x_peak
## h2o_scf:x_peak
## H_scf:LE_scf:co2_scf ***
## H_scf:LE_scf:h2o_scf
## H_scf:LE_scf:x_peak ***
## H_scf:co2_scf:h2o_scf
## H_scf:co2_scf:x_peak
## H_scf:h2o_scf:x_peak
## LE_scf:co2_scf:h2o_scf ***
## LE_scf:co2_scf:x_peak ***
## LE_scf:h2o_scf:x_peak
## co2_scf:h2o_scf:x_peak
## H_scf:LE_scf:co2_scf:h2o_scf
## H_scf:LE_scf:co2_scf:x_peak ***
## H_scf:LE_scf:h2o_scf:x_peak
## H_scf:co2_scf:h2o_scf:x_peak
## LE_scf:co2_scf:h2o_scf:x_peak *
## H_scf:LE_scf:co2_scf:h2o_scf:x_peak *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 209.2 on 3479 degrees of freedom
## (872 observations deleted due to missingness)
## Multiple R-squared: 0.9253, Adjusted R-squared: 0.925
## F-statistic: 2873 on 15 and 3479 DF, p-value: < 2.2e-16
# Reduced model: H_scf plus a hand-picked set of interaction terms, all of
# which also appear in the full interaction model (model1).
model2 <- lm(
  x_70. ~ H_scf + H_scf:LE_scf + LE_scf:co2_scf + LE_scf:x_peak +
    LE_scf:co2_scf:x_peak + H_scf:LE_scf:co2_scf:h2o_scf,
  data = summer_data
)
summary(model2)
##
## Call:
## lm(formula = x_70. ~ H_scf + H_scf:LE_scf + LE_scf:co2_scf +
## LE_scf:x_peak + LE_scf:co2_scf:x_peak + H_scf:LE_scf:co2_scf:h2o_scf,
## data = summer_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4068.7 -120.4 -91.1 80.6 6682.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.494e+03 9.740e+02 -1.534 0.12516
## H_scf 1.265e+03 1.104e+03 1.146 0.25186
## H_scf:LE_scf 1.926e+02 1.809e+02 1.065 0.28704
## LE_scf:co2_scf -5.049e+01 3.105e+01 -1.626 0.10408
## LE_scf:x_peak 5.959e+00 8.235e-02 72.365 < 2e-16 ***
## LE_scf:co2_scf:x_peak -7.138e-01 1.439e-02 -49.590 < 2e-16 ***
## H_scf:LE_scf:co2_scf:h2o_scf 8.727e-01 3.020e-01 2.890 0.00388 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 332 on 3488 degrees of freedom
## (872 observations deleted due to missingness)
## Multiple R-squared: 0.8115, Adjusted R-squared: 0.8112
## F-statistic: 2503 on 6 and 3488 DF, p-value: < 2.2e-16
# Single-predictor model: x_70. explained by H_scf alone.
model3 <- lm(x_70. ~ H_scf, data = summer_data)
# F-test comparing the reduced model (model2) with the full interaction
# model (model1).
anova(model2, model1)
## Analysis of Variance Table
##
## Model 1: x_70. ~ H_scf + H_scf:LE_scf + LE_scf:co2_scf + LE_scf:x_peak +
## LE_scf:co2_scf:x_peak + H_scf:LE_scf:co2_scf:h2o_scf
## Model 2: x_70. ~ (H_scf + LE_scf + co2_scf + h2o_scf + x_peak)^5
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 3488 384360898
## 2 3479 152323466 9 232037432 588.85 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Correlation between two of the predictors, computed on complete pairs only.
with(summer_data, cor(H_scf, LE_scf, use = "na.or.complete"))
## [1] 0.9823915
# Multicollinearity check
# For example, if you are left with columns a, b, c, d, e and f,
# build a data frame from them and compute its correlation matrix:
# data2 = data.frame(a = data$a,b = data$b,c = data$c,d = data$d,e = data$e,f = data$f)
# cor(data2, use = "na.or.complete")