Require the libraries of this class:
## install pacman
if(!require(pacman)) install.packages("pacman") ; require(pacman)## Loading required package: pacman
## require/install packages on this session
require(pacman)
p_load(rio, # import/export data
tidyverse, # tidy-data
skimr, # summary data
caret) # Classification And REgression Training
## set seed
set.seed(0000)First we are going to import some part of the GEIH data that will be used in the problem-set 1. The data dictionary is available here
## load data
db <- import("https://gitlab.com/Lectures-R/bd-meca-2022-summer/lecture-01/-/raw/main/data/GEIH_sample1.Rds")Wha is a tibble?: Tibbles are a modern take on data frames. They keep the features that have stood the test of time, and drop the features that used to be convenient but are now frustrating (i.e. converting character vectors to factors).
Then, Why should I use them?: There are three key
differences between tibbles and data frames: printing, subsetting, and
recycling rules. Run vignette("tibble") on the console to
read more about it.
db <- as_tibble(db) ## from dataframe to tibbleYou can print in console both first or last database observations:
## print data
head(db)## # A tibble: 6 × 178
## Var.1 directorio secuencia_p orden clase dominio mes estrato1 sex age
## <int> <int> <int> <int> <int> <chr> <int> <int> <int> <int>
## 1 1 4514331 1 2 1 BOGOTA 1 2 0 29
## 2 2 4514331 1 1 1 BOGOTA 1 2 1 36
## 3 3 4514332 1 4 1 BOGOTA 1 2 1 4
## 4 4 4514332 1 3 1 BOGOTA 1 2 1 7
## 5 5 4514332 1 1 1 BOGOTA 1 2 0 32
## 6 6 4514332 1 2 1 BOGOTA 1 2 1 35
## # … with 168 more variables: p6050 <int>, p6090 <int>, p6100 <int>,
## # p6210 <int>, p6210s1 <int>, p6240 <int>, oficio <int>, p6426 <int>,
## # relab <int>, p6500 <dbl>, p6510 <int>, p6510s1 <dbl>, p6510s2 <int>,
## # p6545 <int>, p6545s1 <dbl>, p6545s2 <int>, p6580 <int>, p6580s1 <dbl>,
## # p6580s2 <int>, p6585s1 <int>, p6585s1a1 <dbl>, p6585s1a2 <int>,
## # p6585s2 <int>, p6585s2a1 <dbl>, p6585s2a2 <int>, p6585s3 <int>,
## # p6585s3a1 <dbl>, p6585s3a2 <int>, p6585s4 <int>, p6585s4a1 <dbl>, …
tail(db)## # A tibble: 6 × 178
## Var.1 directorio secuencia_p orden clase dominio mes estrato1 sex age
## <int> <int> <int> <int> <int> <chr> <int> <int> <int> <int>
## 1 3212 4540692 1 6 1 BOGOTA 2 2 0 12
## 2 3213 4540693 1 1 1 BOGOTA 2 3 0 50
## 3 3214 4540693 1 2 1 BOGOTA 2 3 0 22
## 4 3215 4540693 1 3 1 BOGOTA 2 3 0 15
## 5 3216 4540694 1 3 1 BOGOTA 2 2 0 7
## 6 3217 4540694 1 1 1 BOGOTA 2 2 1 56
## # … with 168 more variables: p6050 <int>, p6090 <int>, p6100 <int>,
## # p6210 <int>, p6210s1 <int>, p6240 <int>, oficio <int>, p6426 <int>,
## # relab <int>, p6500 <dbl>, p6510 <int>, p6510s1 <dbl>, p6510s2 <int>,
## # p6545 <int>, p6545s1 <dbl>, p6545s2 <int>, p6580 <int>, p6580s1 <dbl>,
## # p6580s2 <int>, p6585s1 <int>, p6585s1a1 <dbl>, p6585s1a2 <int>,
## # p6585s2 <int>, p6585s2a1 <dbl>, p6585s2a2 <int>, p6585s3 <int>,
## # p6585s3a1 <dbl>, p6585s3a2 <int>, p6585s4 <int>, p6585s4a1 <dbl>, …
You can get a complete description of the database using the
skim() function of the skimr library. However,
you can to produce result summaries of the a variable using
summary() function. Let me show you a example:
## summary db
skim(db) %>% head()| Name | db |
| Number of rows | 3217 |
| Number of columns | 178 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| logical | 2 |
| numeric | 3 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| dominio | 0 | 1 | 6 | 6 | 0 | 1 | 0 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| p550 | 3217 | 0 | NaN | : |
| y_gananciaNetaAgro_m | 3217 | 0 | NaN | : |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Var.1 | 0 | 1 | 1609.00 | 928.81 | 1 | 805 | 1609 | 2413 | 3217 | ▇▇▇▇▇ |
| directorio | 0 | 1 | 4527886.71 | 9151.40 | 4514331 | 4521292 | 4526845 | 4535725 | 4540694 | ▃▅▅▁▇ |
| secuencia_p | 0 | 1 | 1.02 | 0.18 | 1 | 1 | 1 | 1 | 4 | ▇▁▁▁▁ |
## summary var
summary(db$y_salary_m)## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 40000 737717 860000 1383727 1300000 30000000 2286
## data + mapping
ggplot(data = db , mapping = aes(x = age , y = y_ingLab_m))## + geometry
ggplot(data = db , mapping = aes(x = age , y = y_ingLab_m)) +
geom_point(col = "red" , size = 0.5)## Warning: Removed 2286 rows containing missing values (geom_point).
## by group
ggplot(data = db ,
mapping = aes(x = age , y = y_ingLab_m , group=as.factor(formal) , color=as.factor(formal))) +
geom_point()## Warning: Removed 2286 rows containing missing values (geom_point).
You can save the chart in a object:
## density: income by sex
p <- ggplot(data=db) +
geom_histogram(mapping = aes(x=y_ingLab_m , group=as.factor(sex) , fill=as.factor(sex)))
p## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2286 rows containing non-finite values (stat_bin).
And you can add attributes to the object that contains the chart:
p + scale_fill_manual(values = c("0"="red" , "1"="blue") , label = c("0"="Hombre" , "1"="Mujer") , name = "Sexo")## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2286 rows containing non-finite values (stat_bin).
Boxplot chart:
## box_plot: estrato1 vs totalHoursWorked
box_plot <- ggplot(data=db , mapping = aes(as.factor(estrato1) , totalHoursWorked)) +
geom_boxplot()
box_plot## Warning: Removed 1652 rows containing non-finite values (stat_boxplot).
## add another geometry
box_plot <- box_plot +
geom_point(aes(colour=as.factor(sex))) +
scale_color_manual(values = c("0"="red" , "1"="blue") , label = c("0"="Hombre" , "1"="Mujer") , name = "Sexo")
box_plot## Warning: Removed 1652 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1652 rows containing missing values (geom_point).
## add theme
box_plot + theme_test()## Warning: Removed 1652 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1652 rows containing missing values (geom_point).
h_hour = ggplot() + geom_histogram(data=db , aes(x=hoursWorkUsual) , fill="#99FF33" , alpha=0.5)
h_hour## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1652 rows containing non-finite values (stat_bin).
db = db %>% mutate(esc_hoursWorkUsual = scale(hoursWorkUsual))
h_hour + geom_histogram(data=db , aes(x=esc_hoursWorkUsual) , fill="#FF0066" , alpha=0.5)## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1652 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1652 rows containing non-finite values (stat_bin).
BoxCoxTrans(db$y_ingLab_m)Aaaa! We’ll deal with NAs in a while, let’s avoid them for now!
BoxCoxTrans(db$y_ingLab_m , na.rm=T)## Box-Cox Transformation
##
## 931 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 40000 820857 983140 1547069 1498083 40000000
##
## Largest/Smallest: 1000
## Sample Skewness: 8.63
##
## Estimated Lambda: -0.1
## With fudge factor, Lambda = 0 will be used for transformations
ggplot() + geom_boxplot(data=db ,aes(x=y_ingLab_m) , fill="darkblue" , alpha=0.5)## Warning: Removed 2286 rows containing non-finite values (stat_boxplot).
db <- db %>% mutate(log_ingLab_m=log(y_ingLab_m))
ggplot() + geom_histogram(data=db , aes(x=log_ingLab_m) , fill="coral1" , alpha=0.5)## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2286 rows containing non-finite values (stat_bin).
quantile(x=db$p6426 , na.rm=T)## 0% 25% 50% 75% 100%
## 0 8 30 84 600
IQR(x=db$p6426 , na.rm=T)## [1] 76
iqr <- IQR(x=db$p6426 , na.rm=T)
db_out <- db %>% subset(p6426 <= 2*iqr | is.na(p6426)==T)
cat("¡Elimina las NA!")## ¡Elimina las NA!
quantile(x=db_out$p6426 , na.rm=T)## 0% 25% 50% 75% 100%
## 0 6 24 56 148
nrow(db) - nrow(db_out)## [1] 208
db = db %>%
mutate(p6426_out = ifelse(test = p6426 > 4*iqr,
yes = 1,
no = 0))
table(db$p6426_out)##
## 0 1
## 1499 66
q = quantile(db$p6426 , na.rm=T)
q## 0% 25% 50% 75% 100%
## 0 8 30 84 600
db = db %>%
mutate(p6426_q = case_when(p6426 < q[2] ~ "Q-1",
p6426 >= q[2] & p6426 < q[3] ~ "Q-2",
p6426 >= q[3] & p6426 < q[4] ~ "Q-3",
p6426 >= q[4] ~ "Q-4"))
table(db$p6426_q)##
## Q-1 Q-2 Q-3 Q-4
## 375 404 376 410
## select: delete a variable
head(db)## # A tibble: 6 × 182
## Var.1 directorio secuencia_p orden clase dominio mes estrato1 sex age
## <int> <int> <int> <int> <int> <chr> <int> <int> <int> <int>
## 1 1 4514331 1 2 1 BOGOTA 1 2 0 29
## 2 2 4514331 1 1 1 BOGOTA 1 2 1 36
## 3 3 4514332 1 4 1 BOGOTA 1 2 1 4
## 4 4 4514332 1 3 1 BOGOTA 1 2 1 7
## 5 5 4514332 1 1 1 BOGOTA 1 2 0 32
## 6 6 4514332 1 2 1 BOGOTA 1 2 1 35
## # … with 172 more variables: p6050 <int>, p6090 <int>, p6100 <int>,
## # p6210 <int>, p6210s1 <int>, p6240 <int>, oficio <int>, p6426 <int>,
## # relab <int>, p6500 <dbl>, p6510 <int>, p6510s1 <dbl>, p6510s2 <int>,
## # p6545 <int>, p6545s1 <dbl>, p6545s2 <int>, p6580 <int>, p6580s1 <dbl>,
## # p6580s2 <int>, p6585s1 <int>, p6585s1a1 <dbl>, p6585s1a2 <int>,
## # p6585s2 <int>, p6585s2a1 <dbl>, p6585s2a2 <int>, p6585s3 <int>,
## # p6585s3a1 <dbl>, p6585s3a2 <int>, p6585s4 <int>, p6585s4a1 <dbl>, …
db %>% select(-Var.1,-orden)## # A tibble: 3,217 × 180
## directorio secuencia_p clase dominio mes estrato1 sex age p6050 p6090
## <int> <int> <int> <chr> <int> <int> <int> <int> <int> <int>
## 1 4514331 1 1 BOGOTA 1 2 0 29 2 1
## 2 4514331 1 1 BOGOTA 1 2 1 36 1 1
## 3 4514332 1 1 BOGOTA 1 2 1 4 3 NA
## 4 4514332 1 1 BOGOTA 1 2 1 7 3 NA
## 5 4514332 1 1 BOGOTA 1 2 0 32 1 1
## 6 4514332 1 1 BOGOTA 1 2 1 35 2 1
## 7 4514333 1 1 BOGOTA 1 2 1 18 3 1
## 8 4514333 1 1 BOGOTA 1 2 0 49 2 1
## 9 4514333 1 1 BOGOTA 1 2 1 51 1 1
## 10 4514333 1 1 BOGOTA 1 2 0 23 3 1
## # … with 3,207 more rows, and 170 more variables: p6100 <int>, p6210 <int>,
## # p6210s1 <int>, p6240 <int>, oficio <int>, p6426 <int>, relab <int>,
## # p6500 <dbl>, p6510 <int>, p6510s1 <dbl>, p6510s2 <int>, p6545 <int>,
## # p6545s1 <dbl>, p6545s2 <int>, p6580 <int>, p6580s1 <dbl>, p6580s2 <int>,
## # p6585s1 <int>, p6585s1a1 <dbl>, p6585s1a2 <int>, p6585s2 <int>,
## # p6585s2a1 <dbl>, p6585s2a2 <int>, p6585s3 <int>, p6585s3a1 <dbl>,
## # p6585s3a2 <int>, p6585s4 <int>, p6585s4a1 <dbl>, p6585s4a2 <int>, …
## select variable: by patter name
db %>% select(starts_with("p6"))## # A tibble: 3,217 × 53
## p6050 p6090 p6100 p6210 p6210s1 p6240 p6426 p6500 p6510 p6510s1 p6510s2
## <int> <int> <int> <int> <int> <int> <int> <dbl> <int> <dbl> <int>
## 1 2 1 2 5 11 6 NA NA NA NA NA
## 2 1 1 2 5 11 1 166 1300000 2 0 NA
## 3 3 NA NA 2 NA NA NA NA NA NA NA
## 4 3 NA NA 3 NA NA NA NA NA NA NA
## 5 1 1 1 4 7 4 NA NA NA NA NA
## 6 2 1 1 5 11 2 NA NA NA NA NA
## 7 3 1 1 6 2 3 NA NA NA NA NA
## 8 2 1 1 5 11 6 NA NA NA NA NA
## 9 1 1 1 6 3 1 12 1200000 2 0 NA
## 10 3 1 1 6 5 6 NA NA NA NA NA
## # … with 3,207 more rows, and 42 more variables: p6545 <int>, p6545s1 <dbl>,
## # p6545s2 <int>, p6580 <int>, p6580s1 <dbl>, p6580s2 <int>, p6585s1 <int>,
## # p6585s1a1 <dbl>, p6585s1a2 <int>, p6585s2 <int>, p6585s2a1 <dbl>,
## # p6585s2a2 <int>, p6585s3 <int>, p6585s3a1 <dbl>, p6585s3a2 <int>,
## # p6585s4 <int>, p6585s4a1 <dbl>, p6585s4a2 <int>, p6590 <int>,
## # p6590s1 <dbl>, p6600 <int>, p6600s1 <dbl>, p6610 <int>, p6610s1 <dbl>,
## # p6620 <int>, p6620s1 <dbl>, p6630s1 <int>, p6630s1a1 <dbl>, …
db %>% select(directorio,contains("salary"))## # A tibble: 3,217 × 4
## directorio y_salary_m y_salary_m_hu y_salarySec_m
## <int> <dbl> <dbl> <dbl>
## 1 4514331 NA NA NA
## 2 4514331 1300000 6741. NA
## 3 4514332 NA NA NA
## 4 4514332 NA NA NA
## 5 4514332 NA NA NA
## 6 4514332 NA NA NA
## 7 4514333 NA NA NA
## 8 4514333 NA NA NA
## 9 4514333 1200000 5833. NA
## 10 4514333 NA NA NA
## # … with 3,207 more rows
## select variable: by class
db %>% select_if(is.character)## # A tibble: 3,217 × 2
## dominio p6426_q
## <chr> <chr>
## 1 BOGOTA <NA>
## 2 BOGOTA Q-4
## 3 BOGOTA <NA>
## 4 BOGOTA <NA>
## 5 BOGOTA <NA>
## 6 BOGOTA <NA>
## 7 BOGOTA <NA>
## 8 BOGOTA <NA>
## 9 BOGOTA Q-2
## 10 BOGOTA <NA>
## # … with 3,207 more rows
## get missing values
is.na(db$y_total_m) %>% table()## .
## FALSE TRUE
## 1354 1863
## replace values:
db %>% select(directorio,y_total_m) %>% tail()## # A tibble: 6 × 2
## directorio y_total_m
## <int> <dbl>
## 1 4540692 NA
## 2 4540693 1000000
## 3 4540693 869453
## 4 4540693 NA
## 5 4540694 NA
## 6 4540694 1000000
db = db %>%
group_by(directorio) %>%
mutate(mean_y_total_m = mean(y_total_m,na.rm=T))
db %>% select(directorio,y_total_m,mean_y_total_m) %>% tail()## # A tibble: 6 × 3
## # Groups: directorio [3]
## directorio y_total_m mean_y_total_m
## <int> <dbl> <dbl>
## 1 4540692 NA 1247444.
## 2 4540693 1000000 934726.
## 3 4540693 869453 934726.
## 4 4540693 NA 934726.
## 5 4540694 NA 1000000
## 6 4540694 1000000 1000000
db = db %>%
mutate(y_total_m = ifelse(test = is.na(y_total_m)==T,
yes = mean_y_total_m,
no = y_total_m))
db %>% select(directorio,y_total_m,mean_y_total_m) %>% tail()## # A tibble: 6 × 3
## # Groups: directorio [3]
## directorio y_total_m mean_y_total_m
## <int> <dbl> <dbl>
## 1 4540692 1247444. 1247444.
## 2 4540693 1000000 934726.
## 3 4540693 869453 934726.
## 4 4540693 934726. 934726.
## 5 4540694 1000000 1000000
## 6 4540694 1000000 1000000
## delete observations
db_c = db %>% subset(is.na(y_total_m)==F)
nrow(db) - nrow(db_c)## [1] 486
is.na(db$y_total_m) %>% table()## .
## FALSE TRUE
## 2731 486
db_c = db %>% dplyr::filter(is.na(y_total_m)==F)Colin Gillespie and Robin Lovelace, 2017. Efficient R Programming, A Practical Guide to Smarter Programming [Avaiable here]
Cap. 3: Efficient programming
Cap. 4: Efficient programming
Kuhn, M., & Johnson, K. (2013). Applied predictive modeling (Vol. 26, p. 13). New York: Springer. [Avaiable here]