Data Wrangling

R for Public Health

Ashwini Kalantri

Department of Community Medicine, MGIMS

24 Sep 2024

Import Data

library(readr)
data <- read_csv("data.csv")

Understand structure of dataset

str(data)            
spc_tbl_ [150 × 28] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ phc         : chr [1:150] "Anji" "Kharangana" "Anji" "Anji" ...
 $ sex         : chr [1:150] "Male" "Female" "Male" "Male" ...
 $ dob         : Date[1:150], format: "1999-10-01" "1999-09-16" ...
 $ age         : num [1:150] 23 23 30 18 29 18 12 27 25 30 ...
 $ gps_lat     : num [1:150] 20.8 20.7 20.8 20.8 20.6 ...
 $ gps_long    : num [1:150] 78.6 78.7 78.5 78.7 78.6 ...
 $ edu         : chr [1:150] "Graduate and above" "Higher Secondary" "Higher Secondary" "Secondary" ...
 $ curSmoke    : chr [1:150] "No" "No" "No" "No" ...
 $ curSmokeless: chr [1:150] "No" "No" "Yes" "No" ...
 $ alcEver     : chr [1:150] "No" "No" "No" "No" ...
 $ met_cat     : chr [1:150] "No" "Yes" "No" "Yes" ...
 $ TotalMetmin : num [1:150] 240 2160 240 720 1800 480 120 200 480 180 ...
 $ wt          : num [1:150] 62.5 48.2 68 48.4 67.8 46.6 32.9 79 53.3 47.5 ...
 $ ht          : num [1:150] 170 153 169 176 NA ...
 $ Glucose     : num [1:150] 80 75 128 88 86 71 63 158 70 80 ...
 $ Cholesterol : num [1:150] 133 139 142 122 126 141 127 98 183 144 ...
 $ wealth_index: chr [1:150] "Highest" "Second" "Second" "Highest" ...
 $ stress      : chr [1:150] "Moderate stress" "High perceived stress" "Moderate stress" "Low stress" ...
 $ depression  : chr [1:150] "No" "No" "Yes" "No" ...
 $ anxiety     : chr [1:150] "No" "No" "No" "No" ...
 $ sbp         : num [1:150] 120 106 116 97 109 99 110 105 101 117 ...
 $ dbp         : num [1:150] 78 71 67 62 68 66 68 65 82 70 ...
 $ sbp1        : num [1:150] 120 106 116 97 109 99 110 105 101 117 ...
 $ sbp2        : num [1:150] 127 109 112 115 106 103 109 108 108 111 ...
 $ sbp3        : num [1:150] 117 93 112 115 106 105 121 115 114 101 ...
 $ dbp1        : num [1:150] 78 71 67 62 68 66 68 65 82 70 ...
 $ dbp2        : num [1:150] 82 84 74 78 70 70 66 70 87 73 ...
 $ dbp3        : num [1:150] 85 67 75 86 64 70 81 71 80 68 ...
 - attr(*, "spec")=
  .. cols(
  ..   phc = col_character(),
  ..   sex = col_character(),
  ..   dob = col_date(format = ""),
  ..   age = col_double(),
  ..   gps_lat = col_double(),
  ..   gps_long = col_double(),
  ..   edu = col_character(),
  ..   curSmoke = col_character(),
  ..   curSmokeless = col_character(),
  ..   alcEver = col_character(),
  ..   met_cat = col_character(),
  ..   TotalMetmin = col_double(),
  ..   wt = col_double(),
  ..   ht = col_double(),
  ..   Glucose = col_double(),
  ..   Cholesterol = col_double(),
  ..   wealth_index = col_character(),
  ..   stress = col_character(),
  ..   depression = col_character(),
  ..   anxiety = col_character(),
  ..   sbp = col_double(),
  ..   dbp = col_double(),
  ..   sbp1 = col_double(),
  ..   sbp2 = col_double(),
  ..   sbp3 = col_double(),
  ..   dbp1 = col_double(),
  ..   dbp2 = col_double(),
  ..   dbp3 = col_double()
  .. )
 - attr(*, "problems")=<externalptr> 

Understand structure of dataset

library(dplyr)
glimpse(data)
Rows: 150
Columns: 28
$ phc          <chr> "Anji", "Kharangana", "Anji", "Anji", "Talegaon", "Talega…
$ sex          <chr> "Male", "Female", "Male", "Male", "Male", "Female", "Fema…
$ dob          <date> 1999-10-01, 1999-09-16, 1992-09-07, 2004-04-20, 1993-11-…
$ age          <dbl> 23, 23, 30, 18, 29, 18, 12, 27, 25, 30, 28, 25, 13, 11, 2…
$ gps_lat      <dbl> 20.78229, 20.71299, 20.83439, 20.78233, 20.61487, 20.6314…
$ gps_long     <dbl> 78.62309, 78.68535, 78.50479, 78.66968, 78.64125, 78.6088…
$ edu          <chr> "Graduate and above", "Higher Secondary", "Higher Seconda…
$ curSmoke     <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "No…
$ curSmokeless <chr> "No", "No", "Yes", "No", "Yes", "No", "No", "No", "Yes", …
$ alcEver      <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "No…
$ met_cat      <chr> "No", "Yes", "No", "Yes", "Yes", "No", "No", "No", "No", …
$ TotalMetmin  <dbl> 240, 2160, 240, 720, 1800, 480, 120, 200, 480, 180, 720, …
$ wt           <dbl> 62.5, 48.2, 68.0, 48.4, 67.8, 46.6, 32.9, 79.0, 53.3, 47.…
$ ht           <dbl> 170.0, 152.9, 169.4, 176.0, NA, 157.6, 152.2, 171.0, 164.…
$ Glucose      <dbl> 80, 75, 128, 88, 86, 71, 63, 158, 70, 80, 110, 100, 78, 8…
$ Cholesterol  <dbl> 133, 139, 142, 122, 126, 141, 127, 98, 183, 144, 135, 143…
$ wealth_index <chr> "Highest", "Second", "Second", "Highest", "Highest", "Mid…
$ stress       <chr> "Moderate stress", "High perceived stress", "Moderate str…
$ depression   <chr> "No", "No", "Yes", "No", "Yes", "No", "No", "No", "No", "…
$ anxiety      <chr> "No", "No", "No", "No", "No", "No", "No", "No", "Yes", "N…
$ sbp          <dbl> 120, 106, 116, 97, 109, 99, 110, 105, 101, 117, 128, 106,…
$ dbp          <dbl> 78, 71, 67, 62, 68, 66, 68, 65, 82, 70, 72, 80, 66, 57, 6…
$ sbp1         <dbl> 120, 106, 116, 97, 109, 99, 110, 105, 101, 117, 128, 106,…
$ sbp2         <dbl> 127, 109, 112, 115, 106, 103, 109, 108, 108, 111, 121, 11…
$ sbp3         <dbl> 117, 93, 112, 115, 106, 105, 121, 115, 114, 101, 125, 91,…
$ dbp1         <dbl> 78, 71, 67, 62, 68, 66, 68, 65, 82, 70, 72, 80, 66, 57, 6…
$ dbp2         <dbl> 82, 84, 74, 78, 70, 70, 66, 70, 87, 73, 69, 77, 72, 97, 6…
$ dbp3         <dbl> 85, 67, 75, 86, 64, 70, 81, 71, 80, 68, 69, 78, 72, 52, 6…

Understand structure of dataset

names(data)
 [1] "phc"          "sex"          "dob"          "age"          "gps_lat"     
 [6] "gps_long"     "edu"          "curSmoke"     "curSmokeless" "alcEver"     
[11] "met_cat"      "TotalMetmin"  "wt"           "ht"           "Glucose"     
[16] "Cholesterol"  "wealth_index" "stress"       "depression"   "anxiety"     
[21] "sbp"          "dbp"          "sbp1"         "sbp2"         "sbp3"        
[26] "dbp1"         "dbp2"         "dbp3"        

Understand structure of dataset

head(data)
# A tibble: 6 × 28
  phc        sex   dob          age gps_lat gps_long edu   curSmoke curSmokeless
  <chr>      <chr> <date>     <dbl>   <dbl>    <dbl> <chr> <chr>    <chr>       
1 Anji       Male  1999-10-01    23    20.8     78.6 Grad… No       No          
2 Kharangana Fema… 1999-09-16    23    20.7     78.7 High… No       No          
3 Anji       Male  1992-09-07    30    20.8     78.5 High… No       Yes         
4 Anji       Male  2004-04-20    18    20.8     78.7 Seco… No       No          
5 Talegaon   Male  1993-11-13    29    20.6     78.6 High… No       Yes         
6 Talegaon   Fema… 2004-07-05    18    20.6     78.6 High… No       No          
# ℹ 19 more variables: alcEver <chr>, met_cat <chr>, TotalMetmin <dbl>,
#   wt <dbl>, ht <dbl>, Glucose <dbl>, Cholesterol <dbl>, wealth_index <chr>,
#   stress <chr>, depression <chr>, anxiety <chr>, sbp <dbl>, dbp <dbl>,
#   sbp1 <dbl>, sbp2 <dbl>, sbp3 <dbl>, dbp1 <dbl>, dbp2 <dbl>, dbp3 <dbl>

5 verbs of dplyr

  • Select

  • Arrange

  • Filter

  • Mutate

  • Summarize

Select

Select

select(.data = data, phc, sex)
# A tibble: 150 × 2
   phc        sex   
   <chr>      <chr> 
 1 Anji       Male  
 2 Kharangana Female
 3 Anji       Male  
 4 Anji       Male  
 5 Talegaon   Male  
 6 Talegaon   Female
 7 Kharangana Female
 8 Anji       Male  
 9 Talegaon   Male  
10 Waifad     Female
# ℹ 140 more rows

Select

select(data, phc, sex) 
# A tibble: 150 × 2
   phc        sex   
   <chr>      <chr> 
 1 Anji       Male  
 2 Kharangana Female
 3 Anji       Male  
 4 Anji       Male  
 5 Talegaon   Male  
 6 Talegaon   Female
 7 Kharangana Female
 8 Anji       Male  
 9 Talegaon   Male  
10 Waifad     Female
# ℹ 140 more rows

Select

data %>% 
  select(phc, sex) 
# A tibble: 150 × 2
   phc        sex   
   <chr>      <chr> 
 1 Anji       Male  
 2 Kharangana Female
 3 Anji       Male  
 4 Anji       Male  
 5 Talegaon   Male  
 6 Talegaon   Female
 7 Kharangana Female
 8 Anji       Male  
 9 Talegaon   Male  
10 Waifad     Female
# ℹ 140 more rows

Select

data %>%
  select(-curSmokeless, -edu, -age)
# A tibble: 150 × 25
   phc    sex   dob        gps_lat gps_long curSmoke alcEver met_cat TotalMetmin
   <chr>  <chr> <date>       <dbl>    <dbl> <chr>    <chr>   <chr>         <dbl>
 1 Anji   Male  1999-10-01    20.8     78.6 No       No      No              240
 2 Khara… Fema… 1999-09-16    20.7     78.7 No       No      Yes            2160
 3 Anji   Male  1992-09-07    20.8     78.5 No       No      No              240
 4 Anji   Male  2004-04-20    20.8     78.7 No       No      Yes             720
 5 Taleg… Male  1993-11-13    20.6     78.6 No       No      Yes            1800
 6 Taleg… Fema… 2004-07-05    20.6     78.6 No       No      No              480
 7 Khara… Fema… 2010-09-24    20.7     78.7 No       No      No              120
 8 Anji   Male  1995-01-03    20.8     78.7 No       No      No              200
 9 Taleg… Male  1997-06-11    20.6     78.6 No       No      No              480
10 Waifad Fema… 1992-07-01    20.7     78.5 No       No      No              180
# ℹ 140 more rows
# ℹ 16 more variables: wt <dbl>, ht <dbl>, Glucose <dbl>, Cholesterol <dbl>,
#   wealth_index <chr>, stress <chr>, depression <chr>, anxiety <chr>,
#   sbp <dbl>, dbp <dbl>, sbp1 <dbl>, sbp2 <dbl>, sbp3 <dbl>, dbp1 <dbl>,
#   dbp2 <dbl>, dbp3 <dbl>

Select using helpers

data %>%
  select(last_col())
# A tibble: 150 × 1
    dbp3
   <dbl>
 1    85
 2    67
 3    75
 4    86
 5    64
 6    70
 7    81
 8    71
 9    80
10    68
# ℹ 140 more rows

Select using helpers

data %>%
  select(starts_with("sbp"))
# A tibble: 150 × 4
     sbp  sbp1  sbp2  sbp3
   <dbl> <dbl> <dbl> <dbl>
 1   120   120   127   117
 2   106   106   109    93
 3   116   116   112   112
 4    97    97   115   115
 5   109   109   106   106
 6    99    99   103   105
 7   110   110   109   121
 8   105   105   108   115
 9   101   101   108   114
10   117   117   111   101
# ℹ 140 more rows

Select using helpers

data %>%
  select(ends_with("bp3"))
# A tibble: 150 × 2
    sbp3  dbp3
   <dbl> <dbl>
 1   117    85
 2    93    67
 3   112    75
 4   115    86
 5   106    64
 6   105    70
 7   121    81
 8   115    71
 9   114    80
10   101    68
# ℹ 140 more rows

Select using helpers

data %>%
  select(contains("met"))
# A tibble: 150 × 2
   met_cat TotalMetmin
   <chr>         <dbl>
 1 No              240
 2 Yes            2160
 3 No              240
 4 Yes             720
 5 Yes            1800
 6 No              480
 7 No              120
 8 No              200
 9 No              480
10 No              180
# ℹ 140 more rows

Select using helpers

data %>%
  select(num_range("sbp", 1:3))
# A tibble: 150 × 3
    sbp1  sbp2  sbp3
   <dbl> <dbl> <dbl>
 1   120   127   117
 2   106   109    93
 3   116   112   112
 4    97   115   115
 5   109   106   106
 6    99   103   105
 7   110   109   121
 8   105   108   115
 9   101   108   114
10   117   111   101
# ℹ 140 more rows

Select using helpers

data %>%
  group_by(sex,edu) %>% 
  select(group_cols())
# A tibble: 150 × 2
# Groups:   sex, edu [8]
   sex    edu                    
   <chr>  <chr>                  
 1 Male   Graduate and above     
 2 Female Higher Secondary       
 3 Male   Higher Secondary       
 4 Male   Secondary              
 5 Male   Higher Secondary       
 6 Female Higher Secondary       
 7 Female No or primary schooling
 8 Male   Graduate and above     
 9 Male   Secondary              
10 Female Secondary              
# ℹ 140 more rows

Arrange

Arrange

arrange(.data = data, age)
# A tibble: 150 × 28
   phc       sex   dob          age gps_lat gps_long edu   curSmoke curSmokeless
   <chr>     <chr> <date>     <dbl>   <dbl>    <dbl> <chr> <chr>    <chr>       
 1 Anji      Male  2012-02-22    10    20.8     78.7 No o… No       No          
 2 Anji      Fema… 2012-01-28    10    20.8     78.6 No o… No       No          
 3 Anji      Male  2012-01-11    10    20.8     78.5 No o… No       No          
 4 Kharanga… Fema… 2011-09-12    11    20.7     78.7 No o… No       No          
 5 Anji      Fema… 2011-10-22    11    20.8     78.7 No o… No       No          
 6 Kharanga… Male  2011-09-29    11    20.7     78.7 No o… No       No          
 7 Anji      Male  2011-05-14    11    20.9     78.5 No o… No       No          
 8 Kharanga… Fema… 2010-09-24    12    20.7     78.7 No o… No       No          
 9 Kharanga… Male  2010-03-22    12    20.7     78.6 No o… No       No          
10 Talegaon  Fema… 2009-11-29    12    20.7     78.5 No o… No       No          
# ℹ 140 more rows
# ℹ 19 more variables: alcEver <chr>, met_cat <chr>, TotalMetmin <dbl>,
#   wt <dbl>, ht <dbl>, Glucose <dbl>, Cholesterol <dbl>, wealth_index <chr>,
#   stress <chr>, depression <chr>, anxiety <chr>, sbp <dbl>, dbp <dbl>,
#   sbp1 <dbl>, sbp2 <dbl>, sbp3 <dbl>, dbp1 <dbl>, dbp2 <dbl>, dbp3 <dbl>

Arrange

data %>%
  select(phc, age, ht, wt) %>% 
  arrange(ht)
# A tibble: 150 × 4
   phc          age    ht    wt
   <chr>      <dbl> <dbl> <dbl>
 1 Anji          10  134   30.9
 2 Anji          10  136.  22.5
 3 Kharangana    11  137   29  
 4 Kharangana    16  138.  39.4
 5 Anji          13  140   27  
 6 Anji          12  140.  30.5
 7 Kharangana    11  143.  33.5
 8 Talegaon      25  143.  42.6
 9 Anji          12  144.  35  
10 Anji          27  144   35.2
# ℹ 140 more rows

Arrange

data %>%
  select(phc, age, ht, wt) %>%
  arrange(desc(ht))
# A tibble: 150 × 4
   phc          age    ht    wt
   <chr>      <dbl> <dbl> <dbl>
 1 Anji          25  178.  64.1
 2 Talegaon      24  177   93.9
 3 Talegaon      15  177   59.5
 4 Anji          18  176   48.4
 5 Anji          23  175.  60  
 6 Kharangana    30  175   55.3
 7 Kharangana    22  175   57.5
 8 Talegaon      20  175   53.2
 9 Anji          29  175.  72  
10 Talegaon      30  174.  89.2
# ℹ 140 more rows

Arrange

data %>%
  select(phc, age, ht, wt) %>%
  arrange(age,ht)
# A tibble: 150 × 4
   phc          age    ht    wt
   <chr>      <dbl> <dbl> <dbl>
 1 Anji          10  134   30.9
 2 Anji          10  136.  22.5
 3 Anji          10   NA   NA  
 4 Kharangana    11  137   29  
 5 Kharangana    11  143.  33.5
 6 Anji          11  144.  32.4
 7 Anji          11  156.  39.5
 8 Anji          12  140.  30.5
 9 Anji          12  144.  35  
10 Talegaon      12  145.  33.2
# ℹ 140 more rows

Arrange

data %>%
  select(phc, wt, sex, ht) %>% 
  group_by(phc, sex) %>% 
  arrange(ht, .by_group = T)
# A tibble: 150 × 4
# Groups:   phc, sex [8]
   phc      wt sex       ht
   <chr> <dbl> <chr>  <dbl>
 1 Anji   30.9 Female  134 
 2 Anji   35.2 Female  144 
 3 Anji   39.3 Female  146.
 4 Anji   NA   Female  147 
 5 Anji   36.9 Female  150.
 6 Anji   45   Female  152.
 7 Anji   65.3 Female  152.
 8 Anji   NA   Female  152.
 9 Anji   50.5 Female  152.
10 Anji   43.8 Female  152.
# ℹ 140 more rows

Ungroup

data %>%
  select(phc, wt, sex, ht) %>% 
  group_by(phc, sex) %>%
  arrange(ht, .by_group = T) %>%   
  ungroup()
# A tibble: 150 × 4
   phc      wt sex       ht
   <chr> <dbl> <chr>  <dbl>
 1 Anji   30.9 Female  134 
 2 Anji   35.2 Female  144 
 3 Anji   39.3 Female  146.
 4 Anji   NA   Female  147 
 5 Anji   36.9 Female  150.
 6 Anji   45   Female  152.
 7 Anji   65.3 Female  152.
 8 Anji   NA   Female  152.
 9 Anji   50.5 Female  152.
10 Anji   43.8 Female  152.
# ℹ 140 more rows

Filter

Filter

filter(.data=data, phc == "Anji")
# A tibble: 62 × 28
   phc   sex    dob          age gps_lat gps_long edu      curSmoke curSmokeless
   <chr> <chr>  <date>     <dbl>   <dbl>    <dbl> <chr>    <chr>    <chr>       
 1 Anji  Male   1999-10-01    23    20.8     78.6 Graduat… No       No          
 2 Anji  Male   1992-09-07    30    20.8     78.5 Higher … No       Yes         
 3 Anji  Male   2004-04-20    18    20.8     78.7 Seconda… No       No          
 4 Anji  Male   1995-01-03    27    20.8     78.7 Graduat… No       No          
 5 Anji  Female 1994-07-27    28    20.8     78.5 Graduat… No       No          
 6 Anji  Male   1995-01-12    27    20.8     78.6 Graduat… No       No          
 7 Anji  Male   2009-06-06    13    20.9     78.5 Seconda… No       No          
 8 Anji  Male   2006-11-21    16    20.8     78.5 Seconda… No       No          
 9 Anji  Female 2006-04-07    16    20.8     78.6 No or p… No       No          
10 Anji  Male   2007-04-25    15    20.9     78.5 Seconda… No       No          
# ℹ 52 more rows
# ℹ 19 more variables: alcEver <chr>, met_cat <chr>, TotalMetmin <dbl>,
#   wt <dbl>, ht <dbl>, Glucose <dbl>, Cholesterol <dbl>, wealth_index <chr>,
#   stress <chr>, depression <chr>, anxiety <chr>, sbp <dbl>, dbp <dbl>,
#   sbp1 <dbl>, sbp2 <dbl>, sbp3 <dbl>, dbp1 <dbl>, dbp2 <dbl>, dbp3 <dbl>

Filter

data %>%
  filter(phc == "Anji")
# A tibble: 62 × 28
   phc   sex    dob          age gps_lat gps_long edu      curSmoke curSmokeless
   <chr> <chr>  <date>     <dbl>   <dbl>    <dbl> <chr>    <chr>    <chr>       
 1 Anji  Male   1999-10-01    23    20.8     78.6 Graduat… No       No          
 2 Anji  Male   1992-09-07    30    20.8     78.5 Higher … No       Yes         
 3 Anji  Male   2004-04-20    18    20.8     78.7 Seconda… No       No          
 4 Anji  Male   1995-01-03    27    20.8     78.7 Graduat… No       No          
 5 Anji  Female 1994-07-27    28    20.8     78.5 Graduat… No       No          
 6 Anji  Male   1995-01-12    27    20.8     78.6 Graduat… No       No          
 7 Anji  Male   2009-06-06    13    20.9     78.5 Seconda… No       No          
 8 Anji  Male   2006-11-21    16    20.8     78.5 Seconda… No       No          
 9 Anji  Female 2006-04-07    16    20.8     78.6 No or p… No       No          
10 Anji  Male   2007-04-25    15    20.9     78.5 Seconda… No       No          
# ℹ 52 more rows
# ℹ 19 more variables: alcEver <chr>, met_cat <chr>, TotalMetmin <dbl>,
#   wt <dbl>, ht <dbl>, Glucose <dbl>, Cholesterol <dbl>, wealth_index <chr>,
#   stress <chr>, depression <chr>, anxiety <chr>, sbp <dbl>, dbp <dbl>,
#   sbp1 <dbl>, sbp2 <dbl>, sbp3 <dbl>, dbp1 <dbl>, dbp2 <dbl>, dbp3 <dbl>

Filter

data %>%
  filter(phc != "Anji")
# A tibble: 88 × 28
   phc       sex   dob          age gps_lat gps_long edu   curSmoke curSmokeless
   <chr>     <chr> <date>     <dbl>   <dbl>    <dbl> <chr> <chr>    <chr>       
 1 Kharanga… Fema… 1999-09-16    23    20.7     78.7 High… No       No          
 2 Talegaon  Male  1993-11-13    29    20.6     78.6 High… No       Yes         
 3 Talegaon  Fema… 2004-07-05    18    20.6     78.6 High… No       No          
 4 Kharanga… Fema… 2010-09-24    12    20.7     78.7 No o… No       No          
 5 Talegaon  Male  1997-06-11    25    20.6     78.6 Seco… No       Yes         
 6 Waifad    Fema… 1992-07-01    30    20.7     78.5 Seco… No       No          
 7 Talegaon  Male  1997-08-15    25    20.7     78.6 High… No       Yes         
 8 Waifad    Male  2009-03-13    13    20.7     78.5 No o… No       No          
 9 Kharanga… Fema… 2011-09-12    11    20.7     78.7 No o… No       No          
10 Talegaon  Fema… 2004-02-28    18    20.6     78.7 High… No       No          
# ℹ 78 more rows
# ℹ 19 more variables: alcEver <chr>, met_cat <chr>, TotalMetmin <dbl>,
#   wt <dbl>, ht <dbl>, Glucose <dbl>, Cholesterol <dbl>, wealth_index <chr>,
#   stress <chr>, depression <chr>, anxiety <chr>, sbp <dbl>, dbp <dbl>,
#   sbp1 <dbl>, sbp2 <dbl>, sbp3 <dbl>, dbp1 <dbl>, dbp2 <dbl>, dbp3 <dbl>

Filter

data %>%
  filter(ht >= 150)
# A tibble: 124 × 28
   phc       sex   dob          age gps_lat gps_long edu   curSmoke curSmokeless
   <chr>     <chr> <date>     <dbl>   <dbl>    <dbl> <chr> <chr>    <chr>       
 1 Anji      Male  1999-10-01    23    20.8     78.6 Grad… No       No          
 2 Kharanga… Fema… 1999-09-16    23    20.7     78.7 High… No       No          
 3 Anji      Male  1992-09-07    30    20.8     78.5 High… No       Yes         
 4 Anji      Male  2004-04-20    18    20.8     78.7 Seco… No       No          
 5 Talegaon  Fema… 2004-07-05    18    20.6     78.6 High… No       No          
 6 Kharanga… Fema… 2010-09-24    12    20.7     78.7 No o… No       No          
 7 Anji      Male  1995-01-03    27    20.8     78.7 Grad… No       No          
 8 Talegaon  Male  1997-06-11    25    20.6     78.6 Seco… No       Yes         
 9 Anji      Fema… 1994-07-27    28    20.8     78.5 Grad… No       No          
10 Talegaon  Male  1997-08-15    25    20.7     78.6 High… No       Yes         
# ℹ 114 more rows
# ℹ 19 more variables: alcEver <chr>, met_cat <chr>, TotalMetmin <dbl>,
#   wt <dbl>, ht <dbl>, Glucose <dbl>, Cholesterol <dbl>, wealth_index <chr>,
#   stress <chr>, depression <chr>, anxiety <chr>, sbp <dbl>, dbp <dbl>,
#   sbp1 <dbl>, sbp2 <dbl>, sbp3 <dbl>, dbp1 <dbl>, dbp2 <dbl>, dbp3 <dbl>

Filter

data %>%
  filter(!is.na(ht))
# A tibble: 147 × 28
   phc       sex   dob          age gps_lat gps_long edu   curSmoke curSmokeless
   <chr>     <chr> <date>     <dbl>   <dbl>    <dbl> <chr> <chr>    <chr>       
 1 Anji      Male  1999-10-01    23    20.8     78.6 Grad… No       No          
 2 Kharanga… Fema… 1999-09-16    23    20.7     78.7 High… No       No          
 3 Anji      Male  1992-09-07    30    20.8     78.5 High… No       Yes         
 4 Anji      Male  2004-04-20    18    20.8     78.7 Seco… No       No          
 5 Talegaon  Fema… 2004-07-05    18    20.6     78.6 High… No       No          
 6 Kharanga… Fema… 2010-09-24    12    20.7     78.7 No o… No       No          
 7 Anji      Male  1995-01-03    27    20.8     78.7 Grad… No       No          
 8 Talegaon  Male  1997-06-11    25    20.6     78.6 Seco… No       Yes         
 9 Waifad    Fema… 1992-07-01    30    20.7     78.5 Seco… No       No          
10 Anji      Fema… 1994-07-27    28    20.8     78.5 Grad… No       No          
# ℹ 137 more rows
# ℹ 19 more variables: alcEver <chr>, met_cat <chr>, TotalMetmin <dbl>,
#   wt <dbl>, ht <dbl>, Glucose <dbl>, Cholesterol <dbl>, wealth_index <chr>,
#   stress <chr>, depression <chr>, anxiety <chr>, sbp <dbl>, dbp <dbl>,
#   sbp1 <dbl>, sbp2 <dbl>, sbp3 <dbl>, dbp1 <dbl>, dbp2 <dbl>, dbp3 <dbl>

Filter

data %>%
  filter(is.na(ht) & is.na(wt))
# A tibble: 1 × 28
  phc   sex   dob          age gps_lat gps_long edu        curSmoke curSmokeless
  <chr> <chr> <date>     <dbl>   <dbl>    <dbl> <chr>      <chr>    <chr>       
1 Anji  Male  2012-01-11    10    20.8     78.5 No or pri… No       No          
# ℹ 19 more variables: alcEver <chr>, met_cat <chr>, TotalMetmin <dbl>,
#   wt <dbl>, ht <dbl>, Glucose <dbl>, Cholesterol <dbl>, wealth_index <chr>,
#   stress <chr>, depression <chr>, anxiety <chr>, sbp <dbl>, dbp <dbl>,
#   sbp1 <dbl>, sbp2 <dbl>, sbp3 <dbl>, dbp1 <dbl>, dbp2 <dbl>, dbp3 <dbl>

Filter

data %>%
  filter(is.na(ht) | is.na(wt))
# A tibble: 6 × 28
  phc      sex    dob          age gps_lat gps_long edu    curSmoke curSmokeless
  <chr>    <chr>  <date>     <dbl>   <dbl>    <dbl> <chr>  <chr>    <chr>       
1 Talegaon Male   1993-11-13    29    20.6     78.6 Highe… No       Yes         
2 Talegaon Female 2006-04-20    16    20.6     78.6 Secon… No       No          
3 Waifad   Male   1993-08-13    29    20.8     78.5 Highe… No       Yes         
4 Anji     Female 2004-02-23    18    20.8     78.7 Highe… No       No          
5 Anji     Female 1996-06-06    26    20.8     78.5 Secon… No       No          
6 Anji     Male   2012-01-11    10    20.8     78.5 No or… No       No          
# ℹ 19 more variables: alcEver <chr>, met_cat <chr>, TotalMetmin <dbl>,
#   wt <dbl>, ht <dbl>, Glucose <dbl>, Cholesterol <dbl>, wealth_index <chr>,
#   stress <chr>, depression <chr>, anxiety <chr>, sbp <dbl>, dbp <dbl>,
#   sbp1 <dbl>, sbp2 <dbl>, sbp3 <dbl>, dbp1 <dbl>, dbp2 <dbl>, dbp3 <dbl>

Filter

data %>%
  filter(!is.na(ht) & !is.na(wt))
# A tibble: 144 × 28
   phc       sex   dob          age gps_lat gps_long edu   curSmoke curSmokeless
   <chr>     <chr> <date>     <dbl>   <dbl>    <dbl> <chr> <chr>    <chr>       
 1 Anji      Male  1999-10-01    23    20.8     78.6 Grad… No       No          
 2 Kharanga… Fema… 1999-09-16    23    20.7     78.7 High… No       No          
 3 Anji      Male  1992-09-07    30    20.8     78.5 High… No       Yes         
 4 Anji      Male  2004-04-20    18    20.8     78.7 Seco… No       No          
 5 Talegaon  Fema… 2004-07-05    18    20.6     78.6 High… No       No          
 6 Kharanga… Fema… 2010-09-24    12    20.7     78.7 No o… No       No          
 7 Anji      Male  1995-01-03    27    20.8     78.7 Grad… No       No          
 8 Talegaon  Male  1997-06-11    25    20.6     78.6 Seco… No       Yes         
 9 Waifad    Fema… 1992-07-01    30    20.7     78.5 Seco… No       No          
10 Anji      Fema… 1994-07-27    28    20.8     78.5 Grad… No       No          
# ℹ 134 more rows
# ℹ 19 more variables: alcEver <chr>, met_cat <chr>, TotalMetmin <dbl>,
#   wt <dbl>, ht <dbl>, Glucose <dbl>, Cholesterol <dbl>, wealth_index <chr>,
#   stress <chr>, depression <chr>, anxiety <chr>, sbp <dbl>, dbp <dbl>,
#   sbp1 <dbl>, sbp2 <dbl>, sbp3 <dbl>, dbp1 <dbl>, dbp2 <dbl>, dbp3 <dbl>

Filter

data %>%
  filter(!is.na(ht & wt)) 
# A tibble: 144 × 28
   phc       sex   dob          age gps_lat gps_long edu   curSmoke curSmokeless
   <chr>     <chr> <date>     <dbl>   <dbl>    <dbl> <chr> <chr>    <chr>       
 1 Anji      Male  1999-10-01    23    20.8     78.6 Grad… No       No          
 2 Kharanga… Fema… 1999-09-16    23    20.7     78.7 High… No       No          
 3 Anji      Male  1992-09-07    30    20.8     78.5 High… No       Yes         
 4 Anji      Male  2004-04-20    18    20.8     78.7 Seco… No       No          
 5 Talegaon  Fema… 2004-07-05    18    20.6     78.6 High… No       No          
 6 Kharanga… Fema… 2010-09-24    12    20.7     78.7 No o… No       No          
 7 Anji      Male  1995-01-03    27    20.8     78.7 Grad… No       No          
 8 Talegaon  Male  1997-06-11    25    20.6     78.6 Seco… No       Yes         
 9 Waifad    Fema… 1992-07-01    30    20.7     78.5 Seco… No       No          
10 Anji      Fema… 1994-07-27    28    20.8     78.5 Grad… No       No          
# ℹ 134 more rows
# ℹ 19 more variables: alcEver <chr>, met_cat <chr>, TotalMetmin <dbl>,
#   wt <dbl>, ht <dbl>, Glucose <dbl>, Cholesterol <dbl>, wealth_index <chr>,
#   stress <chr>, depression <chr>, anxiety <chr>, sbp <dbl>, dbp <dbl>,
#   sbp1 <dbl>, sbp2 <dbl>, sbp3 <dbl>, dbp1 <dbl>, dbp2 <dbl>, dbp3 <dbl>

Mutate

Mutate

data$NewVar <- data$dob

Mutate

data$NewVar
  [1] "1999-10-01" "1999-09-16" "1992-09-07" "2004-04-20" "1993-11-13"
  [6] "2004-07-05" "2010-09-24" "1995-01-03" "1997-06-11" "1992-07-01"
 [11] "1994-07-27" "1997-08-15" "2009-03-13" "2011-09-12" "1995-01-12"
 [16] "2004-02-28" "2006-04-20" "1995-08-25" "2009-06-06" "2006-11-21"
 [21] "2006-04-07" "2007-04-25" "1998-09-17" "1995-09-09" "2009-07-30"
 [26] "1997-04-26" "1992-10-02" "2011-10-22" "2001-10-04" "1998-03-13"
 [31] "1992-07-01" "2004-01-01" "2000-11-12" "2005-12-23" "1993-01-15"
 [36] "2009-06-06" "2002-06-29" "2004-07-10" "1993-11-30" "1997-07-01"
 [41] "2000-12-30" "2008-08-22" "2001-08-01" "1994-11-15" "1993-01-01"
 [46] "1993-07-24" "1998-01-01" "2004-07-16" "1997-12-20" "2007-08-13"
 [51] "2003-09-17" "1993-05-15" "2000-07-15" "1996-05-19" "2010-03-22"
 [56] "1995-09-01" "1993-08-13" "2001-08-28" "2007-12-01" "2004-04-24"
 [61] "2008-01-08" "2008-01-21" "1994-07-25" "2008-11-16" "2007-07-11"
 [66] "2006-08-17" "2004-04-02" "1994-09-14" "1992-09-28" "1995-02-23"
 [71] "2000-09-06" "2006-01-01" "2004-02-23" "2007-05-07" "2002-03-16"
 [76] "1998-10-18" "1995-09-01" "1992-07-01" "1993-01-14" "2002-07-29"
 [81] "2009-11-29" "2008-11-04" "2006-11-30" "2004-04-21" "2006-11-23"
 [86] "1996-05-24" "1995-10-22" "2001-12-03" "2003-09-17" "1994-10-11"
 [91] "1998-10-06" "2011-09-29" "2012-02-22" "1997-12-24" "1999-10-15"
 [96] "2007-02-08" "2007-02-11" "2008-11-11" "1996-06-06" "2003-02-07"
[101] "1998-10-09" "2000-08-26" "2009-02-02" "2000-07-22" "2007-05-14"
[106] "2002-12-27" "1997-03-31" "2010-01-05" "1998-11-18" "1993-06-16"
[111] "2008-04-30" "2011-05-14" "1996-10-19" "2001-05-18" "1997-01-29"
[116] "2003-07-27" "2008-03-28" "1999-08-19" "2008-04-15" "2000-11-30"
[121] "2010-08-15" "2001-02-21" "2001-09-22" "1999-12-11" "2012-01-28"
[126] "1995-11-18" "2006-02-05" "2012-01-11" "2002-07-22" "1995-10-01"
[131] "2006-09-07" "2002-01-06" "2004-06-27" "2003-04-13" "2001-10-13"
[136] "2005-08-19" "1995-03-19" "1999-10-08" "1999-09-27" "2003-07-13"
[141] "2001-09-07" "2008-06-14" "1996-07-09" "1995-06-21" "2006-05-29"
[146] "2006-07-14" "2001-01-31" "2005-03-08" "2003-06-19" "2002-11-03"

Mutate

Null will remove that variable being specified. Here it will be removing NewVar

data$NewVar <- NULL
data$NewVar
NULL

Mutate

mutate(data,
       bmi = wt / ((ht / 100) ^ 2)) %>%
  select(ht, wt, bmi)
# A tibble: 150 × 3
      ht    wt   bmi
   <dbl> <dbl> <dbl>
 1  170   62.5  21.6
 2  153.  48.2  20.6
 3  169.  68    23.7
 4  176   48.4  15.6
 5   NA   67.8  NA  
 6  158.  46.6  18.8
 7  152.  32.9  14.2
 8  171   79    27.0
 9  164.  53.3  19.7
10  150.  47.5  21.2
# ℹ 140 more rows

Mutate

data %>%
  select(ht, wt) %>%
  mutate(ht_m = ht / 100,
         bmi = wt / (ht_m ^ 2))
# A tibble: 150 × 4
      ht    wt  ht_m   bmi
   <dbl> <dbl> <dbl> <dbl>
 1  170   62.5  1.7   21.6
 2  153.  48.2  1.53  20.6
 3  169.  68    1.69  23.7
 4  176   48.4  1.76  15.6
 5   NA   67.8 NA     NA  
 6  158.  46.6  1.58  18.8
 7  152.  32.9  1.52  14.2
 8  171   79    1.71  27.0
 9  164.  53.3  1.64  19.7
10  150.  47.5  1.50  21.2
# ℹ 140 more rows

Mutate

data %>%
  mutate(ht_m = ht/100,
         bmi = round(wt/(ht_m^2),2))
# A tibble: 150 × 30
   phc       sex   dob          age gps_lat gps_long edu   curSmoke curSmokeless
   <chr>     <chr> <date>     <dbl>   <dbl>    <dbl> <chr> <chr>    <chr>       
 1 Anji      Male  1999-10-01    23    20.8     78.6 Grad… No       No          
 2 Kharanga… Fema… 1999-09-16    23    20.7     78.7 High… No       No          
 3 Anji      Male  1992-09-07    30    20.8     78.5 High… No       Yes         
 4 Anji      Male  2004-04-20    18    20.8     78.7 Seco… No       No          
 5 Talegaon  Male  1993-11-13    29    20.6     78.6 High… No       Yes         
 6 Talegaon  Fema… 2004-07-05    18    20.6     78.6 High… No       No          
 7 Kharanga… Fema… 2010-09-24    12    20.7     78.7 No o… No       No          
 8 Anji      Male  1995-01-03    27    20.8     78.7 Grad… No       No          
 9 Talegaon  Male  1997-06-11    25    20.6     78.6 Seco… No       Yes         
10 Waifad    Fema… 1992-07-01    30    20.7     78.5 Seco… No       No          
# ℹ 140 more rows
# ℹ 21 more variables: alcEver <chr>, met_cat <chr>, TotalMetmin <dbl>,
#   wt <dbl>, ht <dbl>, Glucose <dbl>, Cholesterol <dbl>, wealth_index <chr>,
#   stress <chr>, depression <chr>, anxiety <chr>, sbp <dbl>, dbp <dbl>,
#   sbp1 <dbl>, sbp2 <dbl>, sbp3 <dbl>, dbp1 <dbl>, dbp2 <dbl>, dbp3 <dbl>,
#   ht_m <dbl>, bmi <dbl>

Mutate

data %>% 
  mutate(ht_m = ht/100, 
         bmi = round(wt/(ht_m^2),2),
         dob = as.Date(dob,format = "%d-%m-%Y")) %>% 
  select(ht_m,bmi,dob)
# A tibble: 150 × 3
    ht_m   bmi dob       
   <dbl> <dbl> <date>    
 1  1.7   21.6 1999-10-01
 2  1.53  20.6 1999-09-16
 3  1.69  23.7 1992-09-07
 4  1.76  15.6 2004-04-20
 5 NA     NA   1993-11-13
 6  1.58  18.8 2004-07-05
 7  1.52  14.2 2010-09-24
 8  1.71  27.0 1995-01-03
 9  1.64  19.7 1997-06-11
10  1.50  21.2 1992-07-01
# ℹ 140 more rows

Mutate

#conditional mutating, if-then-else function
data %>%
  mutate(highSBP = if_else(sbp1 > 120,
                           "High",
                           "Normal")) %>% 
  select(highSBP,sbp1)
# A tibble: 150 × 2
   highSBP  sbp1
   <chr>   <dbl>
 1 Normal    120
 2 Normal    106
 3 Normal    116
 4 Normal     97
 5 Normal    109
 6 Normal     99
 7 Normal    110
 8 Normal    105
 9 Normal    101
10 Normal    117
# ℹ 140 more rows

Mutate

#categorizing conditional formatting
data %>%
  mutate(weightCat = case_when(wt < 40 ~ "low",
                               wt >= 40 & wt < 60 ~ "Average",
                               wt >= 60 ~ "High")) %>%
  select(wt, weightCat)
# A tibble: 150 × 2
      wt weightCat
   <dbl> <chr>    
 1  62.5 High     
 2  48.2 Average  
 3  68   High     
 4  48.4 Average  
 5  67.8 High     
 6  46.6 Average  
 7  32.9 low      
 8  79   High     
 9  53.3 Average  
10  47.5 Average  
# ℹ 140 more rows

Rename

#Changing the column names
data %>% 
  rename(education = edu) %>% 
  head()
# A tibble: 6 × 28
  phc    sex   dob          age gps_lat gps_long education curSmoke curSmokeless
  <chr>  <chr> <date>     <dbl>   <dbl>    <dbl> <chr>     <chr>    <chr>       
1 Anji   Male  1999-10-01    23    20.8     78.6 Graduate… No       No          
2 Khara… Fema… 1999-09-16    23    20.7     78.7 Higher S… No       No          
3 Anji   Male  1992-09-07    30    20.8     78.5 Higher S… No       Yes         
4 Anji   Male  2004-04-20    18    20.8     78.7 Secondary No       No          
5 Taleg… Male  1993-11-13    29    20.6     78.6 Higher S… No       Yes         
6 Taleg… Fema… 2004-07-05    18    20.6     78.6 Higher S… No       No          
# ℹ 19 more variables: alcEver <chr>, met_cat <chr>, TotalMetmin <dbl>,
#   wt <dbl>, ht <dbl>, Glucose <dbl>, Cholesterol <dbl>, wealth_index <chr>,
#   stress <chr>, depression <chr>, anxiety <chr>, sbp <dbl>, dbp <dbl>,
#   sbp1 <dbl>, sbp2 <dbl>, sbp3 <dbl>, dbp1 <dbl>, dbp2 <dbl>, dbp3 <dbl>

Relocate

data %>% 
  relocate(sex, .after = age)
# A tibble: 150 × 28
   phc       dob          age sex   gps_lat gps_long edu   curSmoke curSmokeless
   <chr>     <date>     <dbl> <chr>   <dbl>    <dbl> <chr> <chr>    <chr>       
 1 Anji      1999-10-01    23 Male     20.8     78.6 Grad… No       No          
 2 Kharanga… 1999-09-16    23 Fema…    20.7     78.7 High… No       No          
 3 Anji      1992-09-07    30 Male     20.8     78.5 High… No       Yes         
 4 Anji      2004-04-20    18 Male     20.8     78.7 Seco… No       No          
 5 Talegaon  1993-11-13    29 Male     20.6     78.6 High… No       Yes         
 6 Talegaon  2004-07-05    18 Fema…    20.6     78.6 High… No       No          
 7 Kharanga… 2010-09-24    12 Fema…    20.7     78.7 No o… No       No          
 8 Anji      1995-01-03    27 Male     20.8     78.7 Grad… No       No          
 9 Talegaon  1997-06-11    25 Male     20.6     78.6 Seco… No       Yes         
10 Waifad    1992-07-01    30 Fema…    20.7     78.5 Seco… No       No          
# ℹ 140 more rows
# ℹ 19 more variables: alcEver <chr>, met_cat <chr>, TotalMetmin <dbl>,
#   wt <dbl>, ht <dbl>, Glucose <dbl>, Cholesterol <dbl>, wealth_index <chr>,
#   stress <chr>, depression <chr>, anxiety <chr>, sbp <dbl>, dbp <dbl>,
#   sbp1 <dbl>, sbp2 <dbl>, sbp3 <dbl>, dbp1 <dbl>, dbp2 <dbl>, dbp3 <dbl>

Relocate

data %>% 
  relocate(sex, .before = edu)
# A tibble: 150 × 28
   phc       dob          age gps_lat gps_long sex   edu   curSmoke curSmokeless
   <chr>     <date>     <dbl>   <dbl>    <dbl> <chr> <chr> <chr>    <chr>       
 1 Anji      1999-10-01    23    20.8     78.6 Male  Grad… No       No          
 2 Kharanga… 1999-09-16    23    20.7     78.7 Fema… High… No       No          
 3 Anji      1992-09-07    30    20.8     78.5 Male  High… No       Yes         
 4 Anji      2004-04-20    18    20.8     78.7 Male  Seco… No       No          
 5 Talegaon  1993-11-13    29    20.6     78.6 Male  High… No       Yes         
 6 Talegaon  2004-07-05    18    20.6     78.6 Fema… High… No       No          
 7 Kharanga… 2010-09-24    12    20.7     78.7 Fema… No o… No       No          
 8 Anji      1995-01-03    27    20.8     78.7 Male  Grad… No       No          
 9 Talegaon  1997-06-11    25    20.6     78.6 Male  Seco… No       Yes         
10 Waifad    1992-07-01    30    20.7     78.5 Fema… Seco… No       No          
# ℹ 140 more rows
# ℹ 19 more variables: alcEver <chr>, met_cat <chr>, TotalMetmin <dbl>,
#   wt <dbl>, ht <dbl>, Glucose <dbl>, Cholesterol <dbl>, wealth_index <chr>,
#   stress <chr>, depression <chr>, anxiety <chr>, sbp <dbl>, dbp <dbl>,
#   sbp1 <dbl>, sbp2 <dbl>, sbp3 <dbl>, dbp1 <dbl>, dbp2 <dbl>, dbp3 <dbl>

Relocate

data %>% 
  relocate(sex, .after = last_col())
# A tibble: 150 × 28
   phc     dob          age gps_lat gps_long edu   curSmoke curSmokeless alcEver
   <chr>   <date>     <dbl>   <dbl>    <dbl> <chr> <chr>    <chr>        <chr>  
 1 Anji    1999-10-01    23    20.8     78.6 Grad… No       No           No     
 2 Kharan… 1999-09-16    23    20.7     78.7 High… No       No           No     
 3 Anji    1992-09-07    30    20.8     78.5 High… No       Yes          No     
 4 Anji    2004-04-20    18    20.8     78.7 Seco… No       No           No     
 5 Talega… 1993-11-13    29    20.6     78.6 High… No       Yes          No     
 6 Talega… 2004-07-05    18    20.6     78.6 High… No       No           No     
 7 Kharan… 2010-09-24    12    20.7     78.7 No o… No       No           No     
 8 Anji    1995-01-03    27    20.8     78.7 Grad… No       No           No     
 9 Talega… 1997-06-11    25    20.6     78.6 Seco… No       Yes          No     
10 Waifad  1992-07-01    30    20.7     78.5 Seco… No       No           No     
# ℹ 140 more rows
# ℹ 19 more variables: met_cat <chr>, TotalMetmin <dbl>, wt <dbl>, ht <dbl>,
#   Glucose <dbl>, Cholesterol <dbl>, wealth_index <chr>, stress <chr>,
#   depression <chr>, anxiety <chr>, sbp <dbl>, dbp <dbl>, sbp1 <dbl>,
#   sbp2 <dbl>, sbp3 <dbl>, dbp1 <dbl>, dbp2 <dbl>, dbp3 <dbl>, sex <chr>

Summarise

Summarise

data %>%
  select(wt, ht) %>%
  summarise(wt_mean = mean(wt), 
            wt_sd = sd(wt), 
            ht_mean = mean(ht), 
            ht_sd = sd(ht))
# A tibble: 1 × 4
  wt_mean wt_sd ht_mean ht_sd
    <dbl> <dbl>   <dbl> <dbl>
1      NA    NA      NA    NA

Summarise

data %>%
  summarise(wt_mean = mean(wt, na.rm = TRUE), 
            wt_sd = sd(wt, na.rm = TRUE), 
            ht_mean = mean(ht, na.rm = TRUE), 
            ht_sd = sd(ht, na.rm = TRUE))
# A tibble: 1 × 4
  wt_mean wt_sd ht_mean ht_sd
    <dbl> <dbl>   <dbl> <dbl>
1    51.8  13.9    159.  10.1

Summarise

data %>%
  select(wt, ht) %>%
  na.omit() %>%
  summarise(wt_mean = mean(wt), 
            wt_sd = sd(wt), 
            ht_mean = mean(ht), 
            ht_sd = sd(ht))
# A tibble: 1 × 4
  wt_mean wt_sd ht_mean ht_sd
    <dbl> <dbl>   <dbl> <dbl>
1    51.7  13.9    159.  10.1

Summarise

data %>%
  select(wt, ht) %>%
  summarise(across(c(ht, wt), ~mean(.,na.rm = TRUE)))
# A tibble: 1 × 2
     ht    wt
  <dbl> <dbl>
1  159.  51.8

Summarise

data %>%
  select(wt, ht, sbp, dbp) %>%
  na.omit() %>%
  summarise(across(everything(), list(min = min, max = max)))
# A tibble: 1 × 8
  wt_min wt_max ht_min ht_max sbp_min sbp_max dbp_min dbp_max
   <dbl>  <dbl>  <dbl>  <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
1   22.5   93.9    134   178.      83     142      54      99

Summarise

data %>%
  select(wt, ht, sex, sbp) %>%
  na.omit() %>%
  mutate(highSBP = if_else(sbp > 120, "High", "Normal")) %>%
  group_by(sex, highSBP) %>%
  summarise(across(everything(), list(mean, sd, median, min, max)))
# A tibble: 4 × 17
# Groups:   sex [2]
  sex    highSBP  wt_1  wt_2  wt_3  wt_4  wt_5  ht_1  ht_2  ht_3  ht_4  ht_5
  <chr>  <chr>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Female High     51.3  12.0  50.5  36.9  73    153.  4.78  152.  145   162 
2 Female Normal   46.3  10.1  43.8  30.9  73    152.  5.81  153   134   162 
3 Male   High     54.2  17.2  56.3  27    84.2  161. 11.0   162   140   175.
4 Male   Normal   54.8  14.7  53.2  22.5  93.9  164. 10.1   165.  136.  178.
# ℹ 5 more variables: sbp_1 <dbl>, sbp_2 <dbl>, sbp_3 <dbl>, sbp_4 <dbl>,
#   sbp_5 <dbl>

Summarise

data %>%
  select(wt, ht, sex, phc) %>%
  group_by(sex) %>%
  summarise(across(where(is.numeric), ~mean(.,na.rm = T)))
# A tibble: 2 × 3
  sex       wt    ht
  <chr>  <dbl> <dbl>
1 Female  47.2  153.
2 Male    54.8  163.

Summarise

data %>%
  group_by(phc, sex) %>%
  summarise(N = n())
# A tibble: 8 × 3
# Groups:   phc [4]
  phc        sex        N
  <chr>      <chr>  <int>
1 Anji       Female    18
2 Anji       Male      44
3 Kharangana Female    14
4 Kharangana Male      12
5 Talegaon   Female    22
6 Talegaon   Male      23
7 Waifad     Female     7
8 Waifad     Male      10

Pivot Long to Wide

library(tidyr)
library(janitor)

data %>% 
  group_by(phc,sex) %>% 
  summarise(N = n()) %>% 
  pivot_wider(names_from = sex,
              values_from = N)
# A tibble: 4 × 3
# Groups:   phc [4]
  phc        Female  Male
  <chr>       <int> <int>
1 Anji           18    44
2 Kharangana     14    12
3 Talegaon       22    23
4 Waifad          7    10

Add totals

data %>% 
  group_by(curSmokeless,edu) %>% 
  summarise(N = n()) %>% 
  pivot_wider(names_from = curSmokeless,
              values_from = N) %>% 
  mutate(across(everything(), ~replace_na(.,0))) %>% 
  adorn_totals(where = c("row","col"))
                     edu  No Yes Total
      Graduate and above  14   0    14
        Higher Secondary  49  11    60
 No or primary schooling  37   5    42
               Secondary  27   7    34
                   Total 127  23   150

Pivot Wide to Long

data %>% 
  select(sbp1,sbp2,sbp3) %>% 
  pivot_longer(cols = contains("sbp"),
               names_to = "sbp_no",
               values_to = "bp")
# A tibble: 450 × 2
   sbp_no    bp
   <chr>  <dbl>
 1 sbp1     120
 2 sbp2     127
 3 sbp3     117
 4 sbp1     106
 5 sbp2     109
 6 sbp3      93
 7 sbp1     116
 8 sbp2     112
 9 sbp3     112
10 sbp1      97
# ℹ 440 more rows

Lead Lag

data %>% 
  mutate(id = 1:n()) %>% 
  select(id,sbp1,sbp3) %>% 
  pivot_longer(cols = contains("sbp"),
               names_to = "sbp_no",
               values_to = "bp") %>% 
  group_by(id) %>% 
  mutate(diff = bp - lag(bp))
# A tibble: 300 × 4
# Groups:   id [150]
      id sbp_no    bp  diff
   <int> <chr>  <dbl> <dbl>
 1     1 sbp1     120    NA
 2     1 sbp3     117    -3
 3     2 sbp1     106    NA
 4     2 sbp3      93   -13
 5     3 sbp1     116    NA
 6     3 sbp3     112    -4
 7     4 sbp1      97    NA
 8     4 sbp3     115    18
 9     5 sbp1     109    NA
10     5 sbp3     106    -3
# ℹ 290 more rows

Joining data

data_sbp <- data %>% 
  mutate(id = 1:n()) %>% 
  select(id,sbp1,sbp2,sbp3) %>% 
  pivot_longer(cols = contains("sbp"),
               names_to = "sbp_no",
               values_to = "bp")

data %>%
  mutate(id = 1:n()) %>% 
  select(id,phc,age,sex,ht,wt) %>% 
  left_join(data_sbp)
# A tibble: 450 × 8
      id phc          age sex       ht    wt sbp_no    bp
   <int> <chr>      <dbl> <chr>  <dbl> <dbl> <chr>  <dbl>
 1     1 Anji          23 Male    170   62.5 sbp1     120
 2     1 Anji          23 Male    170   62.5 sbp2     127
 3     1 Anji          23 Male    170   62.5 sbp3     117
 4     2 Kharangana    23 Female  153.  48.2 sbp1     106
 5     2 Kharangana    23 Female  153.  48.2 sbp2     109
 6     2 Kharangana    23 Female  153.  48.2 sbp3      93
 7     3 Anji          30 Male    169.  68   sbp1     116
 8     3 Anji          30 Male    169.  68   sbp2     112
 9     3 Anji          30 Male    169.  68   sbp3     112
10     4 Anji          18 Male    176   48.4 sbp1      97
# ℹ 440 more rows

Joining data

data %>%
  mutate(id = 1:n()) %>% 
  select(id,phc,age,sex,ht,wt) %>% 
  left_join(data_sbp,
            by = "id")
# A tibble: 450 × 8
      id phc          age sex       ht    wt sbp_no    bp
   <int> <chr>      <dbl> <chr>  <dbl> <dbl> <chr>  <dbl>
 1     1 Anji          23 Male    170   62.5 sbp1     120
 2     1 Anji          23 Male    170   62.5 sbp2     127
 3     1 Anji          23 Male    170   62.5 sbp3     117
 4     2 Kharangana    23 Female  153.  48.2 sbp1     106
 5     2 Kharangana    23 Female  153.  48.2 sbp2     109
 6     2 Kharangana    23 Female  153.  48.2 sbp3      93
 7     3 Anji          30 Male    169.  68   sbp1     116
 8     3 Anji          30 Male    169.  68   sbp2     112
 9     3 Anji          30 Male    169.  68   sbp3     112
10     4 Anji          18 Male    176   48.4 sbp1      97
# ℹ 440 more rows

Joining data

data %>%
  mutate(id = 1:n()) %>% 
  select(id,phc,age,sex,ht,wt) %>% 
  left_join(data_sbp,
            by = "id",
            keep = TRUE)
# A tibble: 450 × 9
    id.x phc          age sex       ht    wt  id.y sbp_no    bp
   <int> <chr>      <dbl> <chr>  <dbl> <dbl> <int> <chr>  <dbl>
 1     1 Anji          23 Male    170   62.5     1 sbp1     120
 2     1 Anji          23 Male    170   62.5     1 sbp2     127
 3     1 Anji          23 Male    170   62.5     1 sbp3     117
 4     2 Kharangana    23 Female  153.  48.2     2 sbp1     106
 5     2 Kharangana    23 Female  153.  48.2     2 sbp2     109
 6     2 Kharangana    23 Female  153.  48.2     2 sbp3      93
 7     3 Anji          30 Male    169.  68       3 sbp1     116
 8     3 Anji          30 Male    169.  68       3 sbp2     112
 9     3 Anji          30 Male    169.  68       3 sbp3     112
10     4 Anji          18 Male    176   48.4     4 sbp1      97
# ℹ 440 more rows

Joining data

data_sbp %>%
  right_join(data %>% 
               mutate(id = 1:n()) %>% 
               select(id,phc,age,sex,ht,wt),
             by = "id")
# A tibble: 450 × 8
      id sbp_no    bp phc          age sex       ht    wt
   <int> <chr>  <dbl> <chr>      <dbl> <chr>  <dbl> <dbl>
 1     1 sbp1     120 Anji          23 Male    170   62.5
 2     1 sbp2     127 Anji          23 Male    170   62.5
 3     1 sbp3     117 Anji          23 Male    170   62.5
 4     2 sbp1     106 Kharangana    23 Female  153.  48.2
 5     2 sbp2     109 Kharangana    23 Female  153.  48.2
 6     2 sbp3      93 Kharangana    23 Female  153.  48.2
 7     3 sbp1     116 Anji          30 Male    169.  68  
 8     3 sbp2     112 Anji          30 Male    169.  68  
 9     3 sbp3     112 Anji          30 Male    169.  68  
10     4 sbp1      97 Anji          18 Male    176   48.4
# ℹ 440 more rows

Joining data

data %>%
  mutate(id = 1:n()) %>% 
  select(id, phc, age, sex, ht, wt) %>%
  filter(age > 15) %>%
  inner_join(data_sbp,
            by = "id")
# A tibble: 342 × 8
      id phc          age sex       ht    wt sbp_no    bp
   <int> <chr>      <dbl> <chr>  <dbl> <dbl> <chr>  <dbl>
 1     1 Anji          23 Male    170   62.5 sbp1     120
 2     1 Anji          23 Male    170   62.5 sbp2     127
 3     1 Anji          23 Male    170   62.5 sbp3     117
 4     2 Kharangana    23 Female  153.  48.2 sbp1     106
 5     2 Kharangana    23 Female  153.  48.2 sbp2     109
 6     2 Kharangana    23 Female  153.  48.2 sbp3      93
 7     3 Anji          30 Male    169.  68   sbp1     116
 8     3 Anji          30 Male    169.  68   sbp2     112
 9     3 Anji          30 Male    169.  68   sbp3     112
10     4 Anji          18 Male    176   48.4 sbp1      97
# ℹ 332 more rows

Joining data

data %>%
  mutate(id = 1:n()) %>% 
  select(id, phc, age, sex, ht, wt) %>% 
  filter(age > 20) %>% 
  bind_rows(data %>%
              mutate(id = 1:n()) %>% 
              select(id, phc, age, sex, ht, wt) %>% 
              filter(age <= 20))
# A tibble: 150 × 6
      id phc          age sex       ht    wt
   <int> <chr>      <dbl> <chr>  <dbl> <dbl>
 1     1 Anji          23 Male    170   62.5
 2     2 Kharangana    23 Female  153.  48.2
 3     3 Anji          30 Male    169.  68  
 4     5 Talegaon      29 Male     NA   67.8
 5     8 Anji          27 Male    171   79  
 6     9 Talegaon      25 Male    164.  53.3
 7    10 Waifad        30 Female  150.  47.5
 8    11 Anji          28 Female  162   59.1
 9    12 Talegaon      25 Male    170   69.6
10    15 Anji          27 Male    167   60.4
# ℹ 140 more rows

Joining data

data %>% 
  select(phc, age, sex, ht, wt) %>% 
  bind_cols(data %>% 
              select(dob, sbp, dbp))
# A tibble: 150 × 8
   phc          age sex       ht    wt dob          sbp   dbp
   <chr>      <dbl> <chr>  <dbl> <dbl> <date>     <dbl> <dbl>
 1 Anji          23 Male    170   62.5 1999-10-01   120    78
 2 Kharangana    23 Female  153.  48.2 1999-09-16   106    71
 3 Anji          30 Male    169.  68   1992-09-07   116    67
 4 Anji          18 Male    176   48.4 2004-04-20    97    62
 5 Talegaon      29 Male     NA   67.8 1993-11-13   109    68
 6 Talegaon      18 Female  158.  46.6 2004-07-05    99    66
 7 Kharangana    12 Female  152.  32.9 2010-09-24   110    68
 8 Anji          27 Male    171   79   1995-01-03   105    65
 9 Talegaon      25 Male    164.  53.3 1997-06-11   101    82
10 Waifad        30 Female  150.  47.5 1992-07-01   117    70
# ℹ 140 more rows