From: "Gemt af Internet Explorer 11" Subject: Georgia - LFS and donor imputation Date: Wed, 30 Dec 2020 10:34:57 +0100 MIME-Version: 1.0 Content-Type: text/html; charset="utf-8" Content-Transfer-Encoding: quoted-printable Content-Location: file://C:\Temp\wz7fe2\Exploration.html X-MimeOLE: Produced By Microsoft MimeOLE V6.1.7601.24158 =EF=BB=BF =20 =20 =20 = Georgia - LFS and=20 donor imputation=20 =20 =20 =20 =20 =20 =20 =20 =20 =20 =20 =20 =20 =20 =20 =20 =20 =20 =20 =20 =20 =20 =20 =20 =20
=20

Georgia - LFS and donor = imputation

library(tidyverse)=0A=
library(simputation)

Read and prep data

lfs_demo <- haven::read_spss("SPSS - =
Eng-2019/LFS_Demograpics.sav")=0A=
=0A=
lfs_ecstat <- haven::read_spss("SPSS - Eng-2019/LFS_ECSTAT.sav")=0A=
=0A=
lfs_family <- haven::read_spss("SPSS - Eng-2019/LFS_Familysize.sav")=0A=
=0A=
lfs_id <- haven::read_spss("SPSS - Eng-2019/LFS_ID.sav")=0A=
=0A=
lfs_weights <- haven::read_spss("SPSS - =
Eng-2019/LFS_Personal_weights.sav")

No. of observations in each data set:

ls(pattern =3D "lfs_.*") %>% =0A=
  set_names(.) %>% =0A=
  map(get) %>% =0A=
  map_int(nrow)
##    lfs_demo  lfs_ecstat  lfs_family      lfs_id =
lfs_weights =0A=
##       77005       61386       21501       21501       =
61386

Demographics has more observations than weights and ecstat. Item non=20 response? (household member level)

Calculate simple response rate:

nrow(lfs_weights)/nrow(lfs_demo)
## [1] 0.797169

Construct data for analysis:

  • Population: UID=E2=80=99s in LFS_ID
  • Weights on household member level from LFS_WEIGHTS
  • Occupation information from LFS_ECSTAT
  • Only keep certain variables:=20
    • General: year, month, quarterno, uid, memberno, region, = urban_rural,=20 p_weights, sex, age
    • Occupation: active, employed, type of employment, additional = employment=20 questions (b*)
lfs <- lfs_id %>% =0A=
  left_join(lfs_weights, by =3D "UID") %>% =0A=
  left_join(lfs_ecstat, by =3D c("UID", "MemberNo")) %>% =0A=
  rename_with(str_to_lower) %>% =0A=
  select(year, month, quarterno, uid, memberno, region, urban_rural, =
p_weights, sex, age,=0A=
         active, active_soft, employed:not_identified, =
starts_with("b"))

Snapshot of data:

lfs %>% print(n_extra =3D 0)
## # A tibble: 61,386 x 41=0A=
##     year month quarterno   uid memberno   region urban_rural =
p_weights     sex=0A=
##    <dbl> <dbl> <chr>     <dbl>    <dbl> =
<dbl+lb>   <dbl+lbl>     <dbl> <dbl+l>=0A=
##  1  2019     3 91        45405        1 29 [Kak~   1 [Urban]      =
106. 2 [Man]=0A=
##  2  2019     3 91        45405        2 29 [Kak~   1 [Urban]      =
107. 1 [Wom~=0A=
##  3  2019     9 93        45406        1 29 [Kak~   2 [Rural]      =
183. 1 [Wom~=0A=
##  4  2019     1 91        45407        1 29 [Kak~   2 [Rural]      =
234. 1 [Wom~=0A=
##  5  2019     5 92        45408        2 11 [Tbi~   1 [Urban]      =
824. 1 [Wom~=0A=
##  6  2019    12 94        45409        1 11 [Tbi~   1 [Urban]      =
424. 1 [Wom~=0A=
##  7  2019    12 94        45409        2 11 [Tbi~   1 [Urban]      =
443. 2 [Man]=0A=
##  8  2019    12 94        45410        2 11 [Tbi~   1 [Urban]      =
335. 1 [Wom~=0A=
##  9  2019    12 94        45410        1 11 [Tbi~   1 [Urban]      =
349. 2 [Man]=0A=
## 10  2019    12 94        45410        4 11 [Tbi~   1 [Urban]      =
335. 1 [Wom~=0A=
## # ... with 61,376 more rows, and 32 more variables

Which quarters are included:

lfs %>% count(year,quarterno)
## # A tibble: 4 x 3=0A=
##    year quarterno     n=0A=
##   <dbl> <chr>     <int>=0A=
## 1  2019 91        15308=0A=
## 2  2019 92        15555=0A=
## 3  2019 93        15446=0A=
## 4  2019 94        15077

Check total population for 4th quarter:

lfs %>% =0A=
  group_by(quarterno) %>% =0A=
  summarise(n =3D n(), N =3D sum(p_weights), .groups =3D =
"drop")
## # A tibble: 4 x 3=0A=
##   quarterno     n        N=0A=
##   <chr>     <int>    <dbl>=0A=
## 1 91        15308 3029168.=0A=
## 2 92        15555 3043425.=0A=
## 3 93        15446 3044084.=0A=
## 4 94        15077 3031826.
lfs %>% =0A=
  filter(quarterno =3D=3D '94') %>% =0A=
  group_by(quarterno, sex) %>% =0A=
  summarise(n =3D n(), N =3D sum(p_weights), .groups =3D =
"drop")
## # A tibble: 2 x 4=0A=
##   quarterno       sex     n        N=0A=
##   <chr>     <dbl+lbl> <int>    <dbl>=0A=
## 1 94        1 [Woman]  7876 1607467.=0A=
## 2 94        2 [Man]    7201 1424359.
lfs %>% =0A=
  filter(quarterno =3D=3D '94') %>% =0A=
  group_by(quarterno, region) %>% =0A=
  summarise(n =3D n(), N =3D sum(p_weights), .groups =3D =
"drop")
## # A tibble: 11 x 4=0A=
##    quarterno                                 region     n       N=0A=
##    <chr>                                  <dbl+lbl> =
<int>   <dbl>=0A=
##  1 94        11 [Tbilisi]                            1915 937861.=0A=
##  2 94        15 [Adjara A/R]                         1714 268115.=0A=
##  3 94        23 [Guria]                              1028  98022.=0A=
##  4 94        26 [Imereti]                            1465 430964.=0A=
##  5 94        29 [Kakheti]                            1469 241949.=0A=
##  6 94        32 [Mtskheta-Mtianeti]                   857  80163.=0A=
##  7 94        35 [Racha-Lechkhumi and Kvemo Svaneti]   808  26989.=0A=
##  8 94        38 [Samegrelo and Zemo Svaneti]         1568 260699.=0A=
##  9 94        41 [Samtskhe-Javakheti]                 1164 134792.=0A=
## 10 94        44 [Kvemo Kartli]                       1453 331493.=0A=
## 11 94        47 [Shida Kartli]                       1636 =
220777.
lfs %>% =0A=
  filter(quarterno =3D=3D '94') %>% =0A=
  group_by(quarterno, employed) %>% =0A=
  summarise(n =3D n(), N =3D sum(p_weights), .groups =3D =
"drop")
## # A tibble: 2 x 4=0A=
##   quarterno  employed     n        N=0A=
##   <chr>     <dbl+lbl> <int>    <dbl>=0A=
## 1 94          0 [No]   5978 1365316.=0A=
## 2 94          1 [Yes]  9099 1666509.

Non-response

First I calculate the overall response matrix R:

R <- lfs %>% =0A=
  select(-year,-month) %>% =0A=
  mutate(across(.cols =3D -c(1:6), .fns =3D negate(is.na))) %>% =0A=
  mutate(across(.cols =3D -c(1:6), .fns =3D as.numeric))=0A=
=0A=
R %>% select(-c(1:6)) %>% print(n_extra =3D 0)
## # A tibble: 61,386 x 33=0A=
##      sex   age active active_soft employed hired self_employed =
not_identified=0A=
##    <dbl> <dbl>  <dbl>       <dbl>    =
<dbl> <dbl>         <dbl>          <dbl>=0A=
##  1     1     1      1           1        1     1             1        =
      1=0A=
##  2     1     1      1           1        1     1             1        =
      1=0A=
##  3     1     1      1           1        1     1             1        =
      1=0A=
##  4     1     1      1           1        1     1             1        =
      1=0A=
##  5     1     1      1           1        1     1             1        =
      1=0A=
##  6     1     1      1           1        1     1             1        =
      1=0A=
##  7     1     1      1           1        1     1             1        =
      1=0A=
##  8     1     1      1           1        1     1             1        =
      1=0A=
##  9     1     1      1           1        1     1             1        =
      1=0A=
## 10     1     1      1           1        1     1             1        =
      1=0A=
## # ... with 61,376 more rows, and 25 more variables

Can also calculate per question:

R_j <- R %>% =0A=
  summarise(across(.cols =3D -c(1:6), .fns =3D mean)) %>% =0A=
  pivot_longer(cols =3D everything(), names_to =3D "variable", values_to =
=3D "R_mean")=0A=
=0A=
R_j
## # A tibble: 33 x 2=0A=
##    variable           R_mean=0A=
##    <chr>               <dbl>=0A=
##  1 sex                 1    =0A=
##  2 age                 1    =0A=
##  3 active              1    =0A=
##  4 active_soft         1    =0A=
##  5 employed            1    =0A=
##  6 hired               1    =0A=
##  7 self_employed       1    =0A=
##  8 not_identified      1    =0A=
##  9 b4_nace_1           0.608=0A=
## 10 brunch_converted_1  0.608=0A=
## # ... with 23 more rows
R_j %>% arrange(R_mean)
## # A tibble: 33 x 2=0A=
##    variable                                  R_mean=0A=
##    <chr>                                      <dbl>=0A=
##  1 b26_97_type_of_transport                 0.00453=0A=
##  2 b26_5_type_of_transport                  0.00681=0A=
##  3 b19_number_of_employees_in_establishment 0.00826=0A=
##  4 b26_3_type_of_transport                  0.0172 =0A=
##  5 b10_work_duration                        0.0202 =0A=
##  6 b26_2_type_of_transport                  0.0291 =0A=
##  7 b16_persons_under_supervision            0.0310 =0A=
##  8 b26_4_type_of_transport                  0.0966 =0A=
##  9 b26_1_type_of_transport                  0.0972 =0A=
## 10 b8_permanency_of_the_job                 0.259  =0A=
## # ... with 23 more rows

Clear signs of routing I=E2=80=99m not catching right now! Look at = the interviews=20 instead:

R_i <- R %>% =0A=
  group_by(across(1:6)) %>% =0A=
  pivot_longer(cols =3D -group_cols(), names_to =3D "variable", =
values_to =3D "R") %>% =0A=
  summarise(R_mean =3D mean(R), .groups =3D "drop")=0A=
=0A=
R_i
## # A tibble: 61,386 x 7=0A=
##    quarterno   uid memberno                  region urban_rural =
p_weights R_mean=0A=
##    <chr>     <dbl>    <dbl>               =
<dbl+lbl>   <dbl+lbl>     <dbl>  <dbl>=0A=
##  1 91        45405        1 29 [Kakheti]              1 [Urban]     =
106.   0.242=0A=
##  2 91        45405        2 29 [Kakheti]              1 [Urban]     =
107.   0.242=0A=
##  3 91        45407        1 29 [Kakheti]              2 [Rural]     =
234.   0.606=0A=
##  4 91        45419        1 38 [Samegrelo and Zemo~   1 [Urban]     =
302.   0.242=0A=
##  5 91        45419        2 38 [Samegrelo and Zemo~   1 [Urban]     =
302.   0.242=0A=
##  6 91        45433        1 29 [Kakheti]              1 [Urban]      =
92.4  0.606=0A=
##  7 91        45433        2 29 [Kakheti]              1 [Urban]      =
93.6  0.242=0A=
##  8 91        45433        3 29 [Kakheti]              1 [Urban]      =
92.5  0.242=0A=
##  9 91        45434        1 29 [Kakheti]              2 [Rural]     =
192.   0.242=0A=
## 10 91        45434        2 29 [Kakheti]              2 [Rural]     =
190.   0.242=0A=
## # ... with 61,376 more rows
R_i %>% arrange(desc(R_mean))
## # A tibble: 61,386 x 7=0A=
##    quarterno   uid memberno                  region urban_rural =
p_weights R_mean=0A=
##    <chr>     <dbl>    <dbl>               =
<dbl+lbl>   <dbl+lbl>     <dbl>  <dbl>=0A=
##  1 93        65081        2 23 [Guria]                2 [Rural]      =
95.9  0.818=0A=
##  2 94        48046        6 38 [Samegrelo and Zemo~   1 [Urban]     =
126.   0.818=0A=
##  3 91        48092        1 35 [Racha-Lechkhumi an~   2 [Rural]      =
28.6  0.788=0A=
##  4 91        48686        2 47 [Shida Kartli]         2 [Rural]     =
123.   0.788=0A=
##  5 91        49075        4 38 [Samegrelo and Zemo~   2 [Rural]     =
157.   0.788=0A=
##  6 91        49208        1 47 [Shida Kartli]         2 [Rural]     =
153.   0.788=0A=
##  7 91        50640        1 38 [Samegrelo and Zemo~   2 [Rural]     =
257.   0.788=0A=
##  8 91        51787        1 47 [Shida Kartli]         1 [Urban]     =
204.   0.788=0A=
##  9 91        52480        3 11 [Tbilisi]              1 [Urban]     =
257.   0.788=0A=
## 10 91        53122        1 38 [Samegrelo and Zemo~   2 [Rural]     =
126.   0.788=0A=
## # ... with 61,376 more rows

What about the NACE levels:

R %>% select(uid, memberno, employed, =
matches("nace"))
## # A tibble: 61,386 x 5=0A=
##      uid memberno employed b4_nace_1 b4_nace_2=0A=
##    <dbl>    <dbl>    <dbl>     <dbl>     =
<dbl>=0A=
##  1 45405        1        1         0         0=0A=
##  2 45405        2        1         0         0=0A=
##  3 45406        1        1         0         0=0A=
##  4 45407        1        1         1         1=0A=
##  5 45408        2        1         0         0=0A=
##  6 45409        1        1         0         0=0A=
##  7 45409        2        1         1         1=0A=
##  8 45410        2        1         1         1=0A=
##  9 45410        1        1         1         1=0A=
## 10 45410        4        1         1         1=0A=
## # ... with 61,376 more rows
lfs %>% count(employed, =
is.na(b4_nace_1))
## # A tibble: 3 x 3=0A=
##    employed `is.na(b4_nace_1)`     n=0A=
##   <dbl+lbl> <lgl>              <int>=0A=
## 1   0 [No]  TRUE               24060=0A=
## 2   1 [Yes] FALSE              37324=0A=
## 3   1 [Yes] TRUE                   2
lfs %>% count(employed, =
is.na(b4_nace_2))
## # A tibble: 3 x 3=0A=
##    employed `is.na(b4_nace_2)`     n=0A=
##   <dbl+lbl> <lgl>              <int>=0A=
## 1   0 [No]  TRUE               24060=0A=
## 2   1 [Yes] FALSE              37324=0A=
## 3   1 [Yes] TRUE                   2

Only 2 non-response for NACE 2 in the entire = year!

lfs %>% =0A=
  filter(employed =3D=3D 1, is.na(b4_nace_1)) %>% =0A=
  select(quarterno, uid, region, p_weights, sex, age, =
matches("nace"))
## # A tibble: 2 x 8=0A=
##   quarterno   uid       region p_weights       sex   age b4_nace_1 =
b4_nace_2=0A=
##   <chr>     <dbl>    <dbl+lbl>     <dbl> =
<dbl+lbl> <dbl>     <dbl>     <dbl>=0A=
## 1 92        50041 11 [Tbilisi]      840.   2 [Man]    29        NA    =
    NA=0A=
## 2 91        50294 11 [Tbilisi]      868.   2 [Man]    29        NA    =
    NA

Analyze employed persons

lfs_empl <- lfs %>% filter(employed =3D=3D =
1)

Special interest in the following variables:

  • Employment type
lfs_empl %>% =
summarise(across(hired:not_identified, sum))
## # A tibble: 1 x 3=0A=
##   hired self_employed not_identified=0A=
##   <dbl>         <dbl>          <dbl>=0A=
## 1 15864         21453              9
  • B4. What kinds of goods or services are mainly provided at your = working=20 place (in your business)?
lfs_empl %>% =
count(is.na(b4_nace_1))
## # A tibble: 2 x 2=0A=
##   `is.na(b4_nace_1)`     n=0A=
##   <lgl>              <int>=0A=
## 1 FALSE              37324=0A=
## 2 TRUE                   2
  • B5. Please specify your occupation at your working place (please = describe=20 as completely as possible):
lfs_empl %>% =
count(is.na(b5_occupation))
## # A tibble: 2 x 2=0A=
##   `is.na(b5_occupation)`     n=0A=
##   <lgl>                  <int>=0A=
## 1 FALSE                  37320=0A=
## 2 TRUE                       6
  • B6. Please define the form of ownership of the enterprise, = organization,=20 economy or business where you were working during the last 7 = days:
lfs_empl %>% count(b6_sector)
## # A tibble: 4 x 2=0A=
##                b6_sector     n=0A=
##                <dbl+lbl> <int>=0A=
## 1  1 [State ownership]    6341=0A=
## 2  2 [Private ownership] 30871=0A=
## 3 97 [Other type]          106=0A=
## 4 98 [Do not know]           8
  • B7. Please define your employment status for the last 7 = days:
lfs_empl %>% count(b7_status)
## # A tibble: 6 x 2=0A=
##                                 b7_status     n=0A=
##                                 <dbl+lbl> <int>=0A=
## 1  1 [Employee]                           15864=0A=
## 2  2 [Employer]                             507=0A=
## 3  3 [Own account worker]                 12583=0A=
## 4  4 [Members of producers=E2=80=99 cooperatives]     9=0A=
## 5  5 [Unpaid family worker]                8354=0A=
## 6 97                                          9
  • B8. Your job was:
lfs_empl %>% =
count(b8_permanency_of_the_job)
## # A tibble: 6 x 2=0A=
##   b8_permanency_of_the_job     n=0A=
##                  <dbl+lbl> <int>=0A=
## 1         1 [Permanent]    14629=0A=
## 2         2 [Temporary]      822=0A=
## 3         3 [Seasonal]       325=0A=
## 4         4 [Odd (Casual)]    51=0A=
## 5        98 [Do not know]     46=0A=
## 6        NA                21453
  • B17/18. Please specify the interval of your net earnings, if it is = hard to=20 say exact amount of you earnings for the last worked month:
lfs_empl %>% =
count(b17_b18_net_earnings)
## # A tibble: 11 x 2=0A=
##       b17_b18_net_earnings     n=0A=
##                  <dbl+lbl> <int>=0A=
##  1  1 [100 GEL or less]      295=0A=
##  2  2 [101-200 GEL]         1487=0A=
##  3  3 [201-400 GEL]         4195=0A=
##  4  4 [401-600 GEL]         3500=0A=
##  5  5 [601-800 GEL]         2166=0A=
##  6  6 [801-1000 GEL]        1413=0A=
##  7  7 [1001-1500 GEL]        892=0A=
##  8  8 [1501-2000 GEL]        233=0A=
##  9  9 [More than 2000 GEL]   137=0A=
## 10 88 [Refuse the answer]   1555=0A=
## 11 NA                      21453
  • B21. Where did you carry out your main job during the last 7 = days?
lfs_empl %>% count(b21_workplace)
## # A tibble: 14 x 2=0A=
##                                                       b21_workplace   =
  n=0A=
##                                                           =
<dbl+lbl> <int>=0A=
##  1  1 [At his/her household dwelling]                                 =
230=0A=
##  2  2 [Client's place]                                               =
1269=0A=
##  3  3 [Formal office]                                                =
9047=0A=
##  4  4 [Factory/atelier/workshop, etc. instead of your living place]  =
2245=0A=
##  5  5 [Plantations/farm/garden/field]                               =
17558=0A=
##  6  6 [Construction sites]                                            =
928=0A=
##  7  7 [Mines/quarry]                                                  =
100=0A=
##  8  8 [Shop /kiosk/coffee house/restaurant/hotel]                    =
2491=0A=
##  9  9 [Different places (mobile)]                                    =
2512=0A=
## 10 10 [Fixed, street or market stall]                                 =
494=0A=
## 11 11 [Pond/lake/river /see]                                          =
151=0A=
## 12 12 [Forest]                                                        =
 30=0A=
## 13 97 [Other]                                                         =
268=0A=
## 14 NA                                                                 =
  3
  • B22. How many persons are employed at local unit of your=20 organization/enterprise/farm?
lfs_empl %>% =
count(b22__employed_at_local_unit)
## # A tibble: 7 x 2=0A=
##   b22__employed_at_local_unit     n=0A=
##                     <dbl+lbl> <int>=0A=
## 1          1 [1-10 persons]   20544=0A=
## 2          2 [11-19 persons]   2597=0A=
## 3          3 [20-49 persons]   3654=0A=
## 4          4 [50 or more]      4147=0A=
## 5         98 [Do not know]     1528=0A=
## 6         99 [Not applicable]  4855=0A=
## 7         NA                      1

Conclusion: Additional coded non-response that could also be=20 imputed.

Before imputation all coded non-response should be=20 handled!

lfs_empl_2 <- lfs_empl %>% =0A=
  haven::zap_labels() %>%=0A=
  transmute(=0A=
    quarterno, uid, memberno, p_weights,=0A=
    region, urban_rural, sex, age,=0A=
    hired =3D hired + not_identified, self_employed,=0A=
    b4_nace_1 =3D str_pad(string =3D b4_nace_1, width =3D 4, side =3D =
"l", pad =3D "0") %>% =0A=
      str_sub(1,1), =0A=
    b5_occupation =3D str_pad(string =3D b5_occupation, width =3D 4, =
side =3D "l", pad =3D "0") %>%=0A=
      str_sub(1,1),=0A=
    b6_sector =3D case_when(b6_sector =3D=3D 97 ~ 2, b6_sector =3D=3D 98 =
~ NA_real_, TRUE ~ b6_sector),=0A=
    b7_status =3D case_when(b7_status =3D=3D 97 ~ NA_real_, TRUE ~ =
b7_status),=0A=
    b8_permanency_of_the_job =3D case_when(=0A=
      self_employed =3D=3D 1 ~ 0,=0A=
      b8_permanency_of_the_job %in% 3:4 ~ 2,=0A=
      b8_permanency_of_the_job =3D=3D 98 ~ NA_real_,=0A=
      TRUE ~ b8_permanency_of_the_job=0A=
    ),=0A=
    b17_b18_net_earnings =3D case_when(=0A=
      self_employed =3D=3D 1 ~ 0,=0A=
      b17_b18_net_earnings =3D=3D 88 ~ NA_real_,=0A=
      TRUE ~ b17_b18_net_earnings=0A=
    ),=0A=
    b22__employed_at_local_unit =3D case_when(=0A=
      b22__employed_at_local_unit %in% 98:99 ~ NA_real_,=0A=
      TRUE ~ b22__employed_at_local_unit=0A=
    )=0A=
  ) %>% =0A=
  mutate(across(.cols =3D -c(quarterno, uid, memberno, age, p_weights), =
.fns =3D as.factor))
summary(lfs_empl_2)
##   quarterno              uid           memberno        =
p_weights      =0A=
##  Length:37326       Min.   :45407   Min.   : 1.000   Min.   :   9.37  =0A=
##  Class :character   1st Qu.:50964   1st Qu.: 1.000   1st Qu.: 106.19  =0A=
##  Mode  :character   Median :56262   Median : 2.000   Median : 145.76  =0A=
##                     Mean   :56229   Mean   : 2.168   Mean   : 181.13  =0A=
##                     3rd Qu.:61532   3rd Qu.: 3.000   3rd Qu.: 212.90  =0A=
##                     Max.   :66905   Max.   :15.000   Max.   :1867.20  =0A=
##                                                                       =0A=
##      region      urban_rural sex            age        hired     =
self_employed=0A=
##  15     : 4345   1:12552     1:17572   Min.   :15.00   0:21453   =
0:15873      =0A=
##  29     : 3969   2:24774     2:19754   1st Qu.:35.00   1:15873   =
1:21453      =0A=
##  38     : 3943                         Median :49.00                  =
        =0A=
##  26     : 3903                         Mean   :48.15                  =
        =0A=
##  47     : 3669                         3rd Qu.:60.00                  =
        =0A=
##  44     : 3422                         Max.   :99.00                  =
        =0A=
##  (Other):14075                                                        =
        =0A=
##    b4_nace_1     b5_occupation   b6_sector    b7_status   =0A=
##  0      :17740   6      :17320   1   : 6341   1   :15864  =0A=
##  5      : 4457   2      : 3625   2   :30977   2   :  507  =0A=
##  8      : 4205   5      : 3608   NA's:    8   3   :12583  =0A=
##  4      : 2661   7      : 2804                4   :    9  =0A=
##  7      : 2468   9      : 2504                5   : 8354  =0A=
##  (Other): 5793   (Other): 7459                NA's:    9  =0A=
##  NA's   :    2   NA's   :    6                            =0A=
##  b8_permanency_of_the_job b17_b18_net_earnings =
b22__employed_at_local_unit=0A=
##  0   :21453               0      :21453        1   :20544             =
    =0A=
##  1   :14629               3      : 4195        2   : 2597             =
    =0A=
##  2   : 1198               4      : 3500        3   : 3654             =
    =0A=
##  NA's:   46               5      : 2166        4   : 4147             =
    =0A=
##                           2      : 1487        NA's: 6384             =
    =0A=
##                           (Other): 2970                               =
    =0A=
##                           NA's   : 1555

Choose imputation method

  • Cross validation setup
  • Goal: To find the best prediction for b4_nace_1!
  • Split in test and train data
  • Procedure=20
    1. Make imputations
    2. Check prediction result
set.seed(2020)=0A=
=0A=
lfs_complete <- lfs_empl_2 %>% =0A=
  filter(across(.fns =3D negate(is.na)))=0A=
=0A=
lfs_test <- lfs_complete %>% slice_sample(prop =3D 1/3) %>% =
mutate(b4_nace_1 =3D NA)=0A=
lfs_train <- lfs_complete %>% anti_join(lfs_test, by =3D =
c("quarterno","uid","memberno"))=0A=
=0A=
lfs_incomplete <- bind_rows(lfs_test, lfs_train)

KNN(1) imputation

set.seed(2020)=0A=
=0A=
imput_knn1 <- lfs_incomplete %>%=0A=
  as.data.frame() %>% =0A=
  impute_knn(b4_nace_1 ~ sex + age + hired + self_employed + region + =
b6_sector +=0A=
               b17_b18_net_earnings,=0A=
             k =3D 1)=0A=
=0A=
x <- imput_knn1 %>% =0A=
  transmute(quarterno,uid,memberno, b4_nace_1_imput =3D b4_nace_1) =
%>% =0A=
  semi_join(lfs_test, by =3D c("quarterno","uid","memberno")) %>% =0A=
  left_join(lfs_complete %>% =
select(quarterno,uid,memberno,b4_nace_1), =0A=
            by =3D c("quarterno","uid","memberno")) %>% =0A=
  as_tibble()=0A=
=0A=
x
## # A tibble: 9,887 x 5=0A=
##    quarterno   uid memberno b4_nace_1_imput b4_nace_1=0A=
##    <chr>     <dbl>    <dbl> <fct>           =
<fct>    =0A=
##  1 94        52585        4 0               0        =0A=
##  2 93        64097        4 6               8        =0A=
##  3 93        51214        4 0               1        =0A=
##  4 91        52048        3 0               0        =0A=
##  5 92        48718        3 8               8        =0A=
##  6 91        51716        2 8               8        =0A=
##  7 93        64038        2 0               0        =0A=
##  8 91        45560        4 5               5        =0A=
##  9 92        51300        2 8               7        =0A=
## 10 93        57957        3 0               0        =0A=
## # ... with 9,877 more rows
mean(x$b4_nace_1_imput =3D=3D =
x$b4_nace_1)
## [1] 0.6858501

KNN(5) imputation

set.seed(2020)=0A=
=0A=
imput_knn5 <- lfs_incomplete %>%=0A=
  as.data.frame() %>% =0A=
  impute_knn(b4_nace_1 ~ sex + age + hired + self_employed + region + =
b6_sector +=0A=
               b17_b18_net_earnings,=0A=
             k =3D 5)=0A=
=0A=
x <- imput_knn5 %>% =0A=
  transmute(quarterno,uid,memberno, b4_nace_1_imput =3D b4_nace_1) =
%>% =0A=
  semi_join(lfs_test, by =3D c("quarterno","uid","memberno")) %>% =0A=
  left_join(lfs_complete %>% =
select(quarterno,uid,memberno,b4_nace_1), =0A=
            by =3D c("quarterno","uid","memberno")) %>% =0A=
  as_tibble()=0A=
=0A=
x
## # A tibble: 9,887 x 5=0A=
##    quarterno   uid memberno b4_nace_1_imput b4_nace_1=0A=
##    <chr>     <dbl>    <dbl> <fct>           =
<fct>    =0A=
##  1 94        52585        4 5               0        =0A=
##  2 93        64097        4 6               8        =0A=
##  3 93        51214        4 6               1        =0A=
##  4 91        52048        3 0               0        =0A=
##  5 92        48718        3 8               8        =0A=
##  6 91        51716        2 8               8        =0A=
##  7 93        64038        2 0               0        =0A=
##  8 91        45560        4 5               5        =0A=
##  9 92        51300        2 8               7        =0A=
## 10 93        57957        3 0               0        =0A=
## # ... with 9,877 more rows
mean(x$b4_nace_1_imput =3D=3D =
x$b4_nace_1)
## [1] 0.6385152

PMM imputation

set.seed(2020)=0A=
=0A=
imput_nn <- lfs_incomplete %>% =0A=
  mutate(b4_nace_1 =3D as.numeric(as.character(b4_nace_1))) %>% =0A=
  impute_pmm(b4_nace_1 ~ sex + age + hired + self_employed + region + =
b6_sector +=0A=
               b17_b18_net_earnings) %>% =0A=
  mutate(b4_nace_1 =3D as.factor(b4_nace_1))
## Warning in predict.lm(m, newdat =3D dat[ina, , drop =3D =
FALSE]): prediction from a=0A=
## rank-deficient fit may be misleading
x <- imput_nn %>% =0A=
  transmute(quarterno,uid,memberno, b4_nace_1_imput =3D b4_nace_1) =
%>% =0A=
  semi_join(lfs_test, by =3D c("quarterno","uid","memberno")) %>% =0A=
  left_join(lfs_complete %>% =
select(quarterno,uid,memberno,b4_nace_1), =0A=
            by =3D c("quarterno","uid","memberno"))=0A=
=0A=
x
## # A tibble: 9,887 x 5=0A=
##    quarterno   uid memberno b4_nace_1_imput b4_nace_1=0A=
##    <chr>     <dbl>    <dbl> <fct>           =
<fct>    =0A=
##  1 94        52585        4 1               0        =0A=
##  2 93        64097        4 6               8        =0A=
##  3 93        51214        4 1               1        =0A=
##  4 91        52048        3 1               0        =0A=
##  5 92        48718        3 7               8        =0A=
##  6 91        51716        2 8               8        =0A=
##  7 93        64038        2 0               0        =0A=
##  8 91        45560        4 6               5        =0A=
##  9 92        51300        2 8               7        =0A=
## 10 93        57957        3 0               0        =0A=
## # ... with 9,877 more rows
mean(x$b4_nace_1_imput =3D=3D =
x$b4_nace_1)
## [1] 0.3646202

missForest

set.seed(2020)=0A=
=0A=
imput_mf <- lfs_incomplete %>%=0A=
  as.data.frame() %>% =0A=
  impute_mf(b4_nace_1 ~ sex + age + hired + self_employed + region + =
b6_sector +=0A=
               b17_b18_net_earnings)
##   missForest iteration 1 in progress...done!=0A=
##   missForest iteration 2 in progress...done!=0A=
##   missForest iteration 3 in progress...done!
x <- imput_mf %>% =0A=
  transmute(quarterno,uid,memberno, b4_nace_1_imput =3D b4_nace_1) =
%>% =0A=
  semi_join(lfs_test, by =3D c("quarterno","uid","memberno")) %>% =0A=
  left_join(lfs_complete %>% =
select(quarterno,uid,memberno,b4_nace_1), =0A=
            by =3D c("quarterno","uid","memberno")) %>% =0A=
  as_tibble()=0A=
=0A=
x
## # A tibble: 9,887 x 5=0A=
##    quarterno   uid memberno b4_nace_1_imput b4_nace_1=0A=
##    <chr>     <dbl>    <dbl> <fct>           =
<fct>    =0A=
##  1 94        52585        4 0               0        =0A=
##  2 93        64097        4 6               8        =0A=
##  3 93        51214        4 0               1        =0A=
##  4 91        52048        3 0               0        =0A=
##  5 92        48718        3 8               8        =0A=
##  6 91        51716        2 8               8        =0A=
##  7 93        64038        2 0               0        =0A=
##  8 91        45560        4 3               5        =0A=
##  9 92        51300        2 8               7        =0A=
## 10 93        57957        3 0               0        =0A=
## # ... with 9,877 more rows
mean(x$b4_nace_1_imput =3D=3D =
x$b4_nace_1)
## [1] 0.6629918

** Which one is best? I go further with knn(1) donor = imputation**

Do imputation of missing values i survey

Do the same imputation on the ls_empl_2 data:

lfs_empl_3 <- lfs_empl_2 %>%=0A=
  mutate(b4_nace_1_old =3D b4_nace_1) %>% =0A=
  as.data.frame() %>% =0A=
  impute_knn(b4_nace_1 ~ sex + age + hired + self_employed + region + =
b6_sector +=0A=
               b17_b18_net_earnings,=0A=
             k =3D 1) %>% =0A=
  as_tibble()=0A=
=0A=
lfs_empl_3 %>% =0A=
  group_by(b4_nace_1) %>% =0A=
  summarise(n =3D n(), n_old =3D n() - sum(is.na(b4_nace_1_old)),=0A=
            N =3D sum(p_weights), N_old =3D =
sum(as.numeric(!is.na(b4_nace_1_old))*p_weights))
## `summarise()` ungrouping output (override with `.groups` =
argument)
## # A tibble: 10 x 5=0A=
##    b4_nace_1     n n_old        N    N_old=0A=
##    <fct>     <int> <int>    <dbl>    =
<dbl>=0A=
##  1 0         17740 17740 2579549. 2579549.=0A=
##  2 1          1187  1187  230747.  230747.=0A=
##  3 2           698   698  158682.  158682.=0A=
##  4 3           170   170   45479.   45479.=0A=
##  5 4          2661  2661  502654.  502654.=0A=
##  6 5          4457  4457  995600.  995600.=0A=
##  7 6          2150  2150  505875.  505875.=0A=
##  8 7          2470  2468  541231.  539523.=0A=
##  9 8          4205  4205  849789.  849789.=0A=
## 10 9          1588  1588  351074.  =
351074.
=20 =20 =20 =