Skip to contents

Load packages and data

## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(dietaryindex)
## Loaded dietaryindex
## Thank you for using dietaryindex!
## Tutorials: https://github.com/jamesjiadazhan/dietaryindex
## Questions, issues: Follow the prompts at https://github.com/jamesjiadazhan/dietaryindex/blob/main/.github/ISSUE_TEMPLATE/bug_report.md
## Cite us:  citation('dietaryindex') or
## Jiada James Zhan, Rebecca A Hodge, Anne Dunlop, et al. Dietaryindex: A User-Friendly and Versatile R Package for Standardizing Dietary Pattern Analysis in Epidemiological and Clinical Studies. bioRxiv. Published online August 09, 2023:2023.08.09.548466. doi:10.1101/2023.08.07.548466
# include COMPLEX SURVEY DESIGN #######################################
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
options(survey.lonely.psu = "adjust") #accounts for the lonely psu problem from subsetting survey data to small groups

# Load the data
data("DASH_trial")
data("PREDIMED_trial")
data("NHANES_20172018")

# set up working dictionary
setwd("/Users/yan/Desktop/NHANES_combined")

# Load the NHANES data from 2005 to 2018
## NHANES 2005-2006
load("NHANES_20052006.rda")
## NHANES 2007-2008
load("NHANES_20072008.rda")
## NHANES 2009-2010
load("NHANES_20092010.rda")
## NHANES 2011-2012
load("NHANES_20112012.rda")
## NHANES 2013-2014
load("NHANES_20132014.rda")
## NHANES 2015-2016
load("NHANES_20152016.rda")

Dietary index calculations

# DASHI from the DASH trial
DASHI_DASH = DASHI(
    SERV_DATA = DASH_trial, 
    RESPONDENTID = DASH_trial$Diet_Type,
    TOTALKCAL_DASHI = DASH_trial$Kcal, 
    TOTAL_FAT_DASHI = DASH_trial$Totalfat_Percent, 
    SAT_FAT_DASHI = DASH_trial$Satfat_Percent, 
    PROTEIN_DASHI = DASH_trial$Protein_Percent, 
    CHOLESTEROL_DASHI = DASH_trial$Cholesterol, 
    FIBER_DASHI = DASH_trial$Fiber, 
    POTASSIUM_DASHI = DASH_trial$Potassium, 
    MAGNESIUM_DASHI = DASH_trial$Magnesium, 
    CALCIUM_DASHI = DASH_trial$Calcium, 
    SODIUM_DASHI = DASH_trial$Sodium)

# MEDI for the PREDIMED trial
MEDI_PREDIMED = MEDI(
  SERV_DATA = PREDIMED_trial,
  RESPONDENTID = PREDIMED_trial$Diet_Type,
  OLIVE_OIL_SERV_MEDI = PREDIMED_trial$Virgin_Oliveoil,
  FRT_SERV_MEDI = PREDIMED_trial$Fruits, 
  VEG_SERV_MEDI = PREDIMED_trial$Vegetables,
  LEGUMES_SERV_MEDI = PREDIMED_trial$Legumes,
  NUTS_SERV_MEDI = PREDIMED_trial$Total_nuts,
  FISH_SEAFOOD_SERV_MEDI = PREDIMED_trial$Fish_Seafood,
  ALCOHOL_SERV_MEDI = PREDIMED_trial$Alcohol,
  SSB_SERV_MEDI = PREDIMED_trial$Soda_Drinks,
  SWEETS_SERV_MEDI = PREDIMED_trial$Sweets,
  DISCRET_FAT_SERV_MEDI = PREDIMED_trial$Refined_Oliveoil,
  REDPROC_MEAT_SERV_MEDI = PREDIMED_trial$Meat)

# set up survey design for NHANES data 2017-2018 day 1 and day 2
## filter out the missing values for the weight variable WTDR2D
## select only persons aged two and older
NHANES_20172018_design_d1d2 = NHANES_20172018$FPED %>%
    filter(!is.na(WTDR2D)) %>%
    filter(RIDAGEYR >= 2)

# filter only the individuals with reliable recall status for FPED, FPED_IND, NUTRIENT, NUTRIENT_IND, FPED2, FPED_IND2, NUTRIENT2, NUTRIENT_IND2
## DAY 1: DR1DRSTZ == 1
## DAY 2: DR2DRSTZ == 1
# also, filter out those kcal intake < 500 or > 5000
## DAY 1: DR1TKCAL >= 500 & DR1TKCAL <= 5000
## DAY 2: DR2TKCAL >= 500 & DR2TKCAL <= 5000
NHANES_20172018$FPED_IND = NHANES_20172018$FPED_IND %>%
    filter(DR1DRSTZ == 1)
NHANES_20172018$NUTRIENT_IND = NHANES_20172018$NUTRIENT_IND %>%
    filter(DR1DRSTZ == 1)
NHANES_20172018$FPED = NHANES_20172018$FPED %>%
    filter(DR1DRSTZ == 1)
NHANES_20172018$NUTRIENT = NHANES_20172018$NUTRIENT %>%
    filter(DR1DRSTZ == 1) %>%
    filter(DR1TKCAL >= 500 & DR1TKCAL <= 5000)

NHANES_20172018$FPED_IND2 = NHANES_20172018$FPED_IND2 %>%
    filter(DR2DRSTZ == 1)
NHANES_20172018$NUTRIENT_IND2 = NHANES_20172018$NUTRIENT_IND2 %>%
    filter(DR2DRSTZ == 1)
NHANES_20172018$FPED2 = NHANES_20172018$FPED2 %>%
    filter(DR2DRSTZ == 1)
NHANES_20172018$NUTRIENT2 = NHANES_20172018$NUTRIENT2 %>%
    filter(DR2DRSTZ == 1) %>%
    filter(DR2TKCAL >= 500 & DR2TKCAL <= 5000)

## NHANES 2017-2018
# DASHI for day 1 and day 2
DASHI_NHANES = DASHI_NHANES_FPED(NUTRIENT_PATH=NHANES_20172018$NUTRIENT, NUTRIENT_PATH2=NHANES_20172018$NUTRIENT2)

# MEDI for day 1 and day 2
MEDI_NHANES = MEDI_NHANES_FPED(FPED_IND_PATH=NHANES_20172018$FPED_IND, NUTRIENT_IND_PATH=NHANES_20172018$NUTRIENT_IND, FPED_IND_PATH2=NHANES_20172018$FPED_IND2, NUTRIENT_IND_PATH2=NHANES_20172018$NUTRIENT_IND2)
## [1] "Since no SSB code is provided, the default SSB code from 17-18 FNDDS file is used."
## [1] "Since no SWEETS code is provided, the default SSB code from 17-18 FNDDS file is used"
## [1] "Since no FAT_OIL code is provided, the default FAT_OIL code from 17-18 FNDDS file is used"
## [1] "Since no OLIVE_OIL code is provided, the default OLIVE_OIL code from 17-18 FNDDS file is used"
# DASH for day 1 and day 2
DASH_NHANES = DASH_NHANES_FPED(NHANES_20172018$FPED_IND, NHANES_20172018$NUTRIENT_IND, NHANES_20172018$FPED_IND2, NHANES_20172018$NUTRIENT_IND2)
## [1] "Since no SSB code is provided, the default SSB code from 17-18 FNDDS file is used."
## [1] "Since no skim milk code is provided, the default skim milk code from 17-18 FNDDS file is used."
## [1] "Since no low-fat cheese code is provided, the default low-fat cheese code from 17-18 FNDDS file is used."
## [1] "Reminder: this DASH index uses quintiles to rank participants' food/drink serving sizes and then calculate DASH component scores, which may generate results that are specific to your study population but not comparable to other populations."
# MED for day 1 and day 2
MED_NHANES = MED_NHANES_FPED(FPED_PATH=NHANES_20172018$FPED, NUTRIENT_PATH=NHANES_20172018$NUTRIENT, DEMO_PATH=NHANES_20172018$DEMO, FPED_PATH2=NHANES_20172018$FPED2, NUTRIENT_PATH2=NHANES_20172018$NUTRIENT2)
## [1] "Reminder: this MED index uses medians to rank participants' food/drink serving sizes and then calculate MED component scores, which may generate results that are specific to your study population but not comparable to other populations."
# AHEI for day 1 and day 2
AHEI_NHANES = AHEI_NHANES_FPED(NHANES_20172018$FPED_IND, NHANES_20172018$NUTRIENT_IND, NHANES_20172018$FPED_IND2, NHANES_20172018$NUTRIENT_IND2)
## [1] "Since no SSB code is provided, the default SSB code from 17-18 FNDDS file is used."
# DII for day 1 and day 2
DII_NHANES = DII_NHANES_FPED(FPED_PATH=NHANES_20172018$FPED, NUTRIENT_PATH=NHANES_20172018$NUTRIENT, DEMO_PATH=NHANES_20172018$DEMO, FPED_PATH2=NHANES_20172018$FPED2, NUTRIENT_PATH2=NHANES_20172018$NUTRIENT2)
## [1] "VITD is included in the calculation in the first day of NHANES data."
## [1] "VITD is included in the calculation in the second day of NHANES data."
## [1] "Reminder: This function does not use all the original DII variables. Eugenol, garlic, ginger, onion, trans fat, turmeric, Green/black tea, Flavan-3-ol, Flavones, Flavonols, Flavonones, Anthocyanidins, Isoflavones, Pepper, Thyme/oregano, Rosemary are not included because they are not available in NHANES."
# HEI2020 for day 1 and day 2
HEI2020_NHANES_1718 = HEI2020_NHANES_FPED(FPED_PATH=NHANES_20172018$FPED, NUTRIENT_PATH=NHANES_20172018$NUTRIENT, DEMO_PATH=NHANES_20172018$DEMO, FPED_PATH2=NHANES_20172018$FPED2, NUTRIENT_PATH2=NHANES_20172018$NUTRIENT2)

# Merge all the previous data frames into one data frame by SEQN
NHANES_20172018_dietaryindex_d1d2 = inner_join(NHANES_20172018_design_d1d2, DASHI_NHANES, by = "SEQN") %>%
    inner_join(MEDI_NHANES, by = "SEQN") %>%
    inner_join(DASH_NHANES, by = "SEQN") %>%
    inner_join(MED_NHANES, by = "SEQN") %>%
    inner_join(AHEI_NHANES, by = "SEQN") %>%
    inner_join(DII_NHANES, by = "SEQN") %>%
    inner_join(HEI2020_NHANES_1718, by = "SEQN")

# set up survey design for NHANES data 2017-2018 day 1 and day 2
NHANES_design_1718_d1d2 <- svydesign(
    id = ~SDMVPSU, 
    strata = ~SDMVSTRA, 
    weight = ~WTDR2D, 
    data = NHANES_20172018_dietaryindex_d1d2, #set up survey design on the full dataset #can restrict at time of analysis 
    nest = TRUE) 

# generate the svymean object for DASHI_ALL
DASHI_1718_svymean = svymean(~DASHI_ALL, design = NHANES_design_1718_d1d2, na.rm = TRUE)
# extract the mean from the svymean object
DASHI_1718_svymean_mean = DASHI_1718_svymean[["DASHI_ALL"]]

# generate the svymean object for MEDI_ALL
MEDI_1718_svymean = svymean(~MEDI_ALL, design = NHANES_design_1718_d1d2, na.rm = TRUE)
# extract the mean from the svymean object
MEDI_1718_svymean_mean = MEDI_1718_svymean[["MEDI_ALL"]]

# generate the svymean object for DASH_ALL
DASH_1718_svymean = svymean(~DASH_ALL, design = NHANES_design_1718_d1d2, na.rm = TRUE)
# extract the mean from the svymean object
DASH_1718_svymean_mean = DASH_1718_svymean[["DASH_ALL"]]

# generate the svymean object for MED_ALL
MED_1718_svymean = svymean(~MED_ALL, design = NHANES_design_1718_d1d2, na.rm = TRUE)
# extract the mean from the svymean object
MED_1718_svymean_mean = MED_1718_svymean[["MED_ALL"]]

# generate the svymean object for AHEI_ALL
AHEI_1718_svymean = svymean(~AHEI_ALL, design = NHANES_design_1718_d1d2, na.rm = TRUE)
# extract the mean from the svymean object
AHEI_1718_svymean_mean = AHEI_1718_svymean[["AHEI_ALL"]]

# generate the svymean object for DII_ALL
DII_1718_svymean = svymean(~DII_ALL, design = NHANES_design_1718_d1d2, na.rm = TRUE)
# extract the mean from the svymean object
DII_1718_svymean_mean = DII_1718_svymean[["DII_ALL"]]

# generate the svymean object for HEI2020_ALL
HEI2020_1718_svymean = svymean(~HEI2020_ALL, design = NHANES_design_1718_d1d2, na.rm = TRUE)
# extract the mean from the svymean object
HEI2020_1718_svymean_mean = HEI2020_1718_svymean[["HEI2020_ALL"]]

# HEI2020 for day 1
## 2017-2018
HEI2020_NHANES_1718_d1 = HEI2020_NHANES_FPED(FPED_PATH=NHANES_20172018$FPED, NUTRIENT_PATH=NHANES_20172018$NUTRIENT, DEMO_PATH=NHANES_20172018$DEMO)
## add year column
HEI2020_NHANES_1718_d1 = HEI2020_NHANES_1718_d1 %>%
    mutate(year = "2017-2018")
# select only necessary columns from FPED data
NHANES_20172018_FPED = NHANES_20172018$FPED %>%
    select(SEQN, SDMVPSU, SDMVSTRA, WTDRD1)
## inner join with the NHANES 2017-2018 day 1 FPED data
HEI2020_NHANES_1718_d1 = inner_join(NHANES_20172018_FPED, HEI2020_NHANES_1718_d1, by = "SEQN")

# perform data filtering for 2005-2016 NHANES data
# filter only the individuals with reliable recall status for FPED, FPED_IND, NUTRIENT, NUTRIENT_IND, FPED2, FPED_IND2, NUTRIENT2, NUTRIENT_IND2
## DAY 1: DR1DRSTZ == 1
# also, filter out those kcal intake < 500 or > 5000
## DAY 1: DR1TKCAL >= 500 & DR1TKCAL <= 5000

## 2015-2016
NHANES_20152016$FPED = NHANES_20152016$FPED %>%
    filter(DR1DRSTZ == 1)
NHANES_20152016$NUTRIENT = NHANES_20152016$NUTRIENT %>%
    filter(DR1DRSTZ == 1) %>%
    filter(DR1TKCAL >= 500 & DR1TKCAL <= 5000)

## 2013-2014
NHANES_20132014$FPED = NHANES_20132014$FPED %>%
    filter(DR1DRSTZ == 1)
NHANES_20132014$NUTRIENT = NHANES_20132014$NUTRIENT %>%
    filter(DR1DRSTZ == 1) %>%
    filter(DR1TKCAL >= 500 & DR1TKCAL <= 5000)

## 2011-2012
NHANES_20112012$FPED = NHANES_20112012$FPED %>%
    filter(DR1DRSTZ == 1)
NHANES_20112012$NUTRIENT = NHANES_20112012$NUTRIENT %>%
    filter(DR1DRSTZ == 1) %>%
    filter(DR1TKCAL >= 500 & DR1TKCAL <= 5000)

## 2009-2010
NHANES_20092010$FPED = NHANES_20092010$FPED %>%
    filter(DR1DRSTZ == 1)
NHANES_20092010$NUTRIENT = NHANES_20092010$NUTRIENT %>%
    filter(DR1DRSTZ == 1) %>%
    filter(DR1TKCAL >= 500 & DR1TKCAL <= 5000)

## 2007-2008
NHANES_20072008$FPED = NHANES_20072008$FPED %>%
    filter(DR1DRSTZ == 1)
NHANES_20072008$NUTRIENT = NHANES_20072008$NUTRIENT %>%
    filter(DR1DRSTZ == 1) %>%
    filter(DR1TKCAL >= 500 & DR1TKCAL <= 5000)

## 2005-2006
NHANES_20052006$FPED = NHANES_20052006$FPED %>%
    filter(DR1DRSTZ == 1)
NHANES_20052006$NUTRIENT = NHANES_20052006$NUTRIENT %>%
    filter(DR1DRSTZ == 1) %>%
    filter(DR1TKCAL >= 500 & DR1TKCAL <= 5000)


## 2015-2016
HEI2020_NHANES_1516_d1 = HEI2020_NHANES_FPED(FPED_PATH=NHANES_20152016$FPED, NUTRIENT_PATH=NHANES_20152016$NUTRIENT, DEMO_PATH=NHANES_20152016$DEMO)
## add year column
HEI2020_NHANES_1516_d1 = HEI2020_NHANES_1516_d1 %>%
    mutate(year = "2015-2016")
# select only necessary columns from FPED data
NHANES_20152016_FPED = NHANES_20152016$FPED %>%
    select(SEQN, SDMVPSU, SDMVSTRA, WTDRD1)
## inner join with the NHANES 2015-2016 day 1 FPED data
HEI2020_NHANES_1516_d1 = inner_join(NHANES_20152016_FPED, HEI2020_NHANES_1516_d1, by = "SEQN")

## 2013-2014
HEI2020_NHANES_1314_d1 = HEI2020_NHANES_FPED(FPED_PATH=NHANES_20132014$FPED, NUTRIENT_PATH=NHANES_20132014$NUTRIENT, DEMO_PATH=NHANES_20132014$DEMO)
## add year column
HEI2020_NHANES_1314_d1 = HEI2020_NHANES_1314_d1 %>%
    mutate(year = "2013-2014")
# select only necessary columns from FPED data
NHANES_20132014_FPED = NHANES_20132014$FPED %>%
    select(SEQN, SDMVPSU, SDMVSTRA, WTDRD1)
## inner join with the NHANES 2013-2014 day 1 FPED data
HEI2020_NHANES_1314_d1 = inner_join(NHANES_20132014_FPED, HEI2020_NHANES_1314_d1, by = "SEQN")

## 2011-2012
HEI2020_NHANES_1112_d1 = HEI2020_NHANES_FPED(FPED_PATH=NHANES_20112012$FPED, NUTRIENT_PATH=NHANES_20112012$NUTRIENT, DEMO_PATH=NHANES_20112012$DEMO)
## add year column
HEI2020_NHANES_1112_d1 = HEI2020_NHANES_1112_d1 %>%
    mutate(year = "2011-2012")
# select only necessary columns from FPED data
NHANES_20112012_FPED = NHANES_20112012$FPED %>%
    select(SEQN, SDMVPSU, SDMVSTRA, WTDRD1)
## inner join with the NHANES 2011-2012 day 1 FPED data
HEI2020_NHANES_1112_d1 = inner_join(NHANES_20112012_FPED, HEI2020_NHANES_1112_d1, by = "SEQN")

## 2009-2010
HEI2020_NHANES_0910_d1 = HEI2020_NHANES_FPED(FPED_PATH=NHANES_20092010$FPED, NUTRIENT_PATH=NHANES_20092010$NUTRIENT, DEMO_PATH=NHANES_20092010$DEMO)
## add year column
HEI2020_NHANES_0910_d1 = HEI2020_NHANES_0910_d1 %>%
    mutate(year = "2009-2010")
# select only necessary columns from FPED data
NHANES_20092010_FPED = NHANES_20092010$FPED %>%
    select(SEQN, SDMVPSU, SDMVSTRA, WTDRD1)
## inner join with the NHANES 2009-2010 day 1 FPED data
HEI2020_NHANES_0910_d1 = inner_join(NHANES_20092010_FPED, HEI2020_NHANES_0910_d1, by = "SEQN")

## 2007-2008
HEI2020_NHANES_0708_d1 = HEI2020_NHANES_FPED(FPED_PATH=NHANES_20072008$FPED, NUTRIENT_PATH=NHANES_20072008$NUTRIENT, DEMO_PATH=NHANES_20072008$DEMO)
## add year column
HEI2020_NHANES_0708_d1 = HEI2020_NHANES_0708_d1 %>%
    mutate(year = "2007-2008")
# select only necessary columns from FPED data
NHANES_20072008_FPED = NHANES_20072008$FPED %>%
    select(SEQN, SDMVPSU, SDMVSTRA, WTDRD1)
## inner join with the NHANES 2007-2008 day 1 FPED data
HEI2020_NHANES_0708_d1 = inner_join(NHANES_20072008_FPED, HEI2020_NHANES_0708_d1, by = "SEQN")

## 2005-2006
HEI2020_NHANES_0506_d1 = HEI2020_NHANES_FPED(FPED_PATH=NHANES_20052006$FPED, NUTRIENT_PATH=NHANES_20052006$NUTRIENT, DEMO_PATH=NHANES_20052006$DEMO)
## add year column
HEI2020_NHANES_0506_d1 = HEI2020_NHANES_0506_d1 %>%
    mutate(year = "2005-2006")
# select only necessary columns from FPED data
NHANES_20052006_FPED = NHANES_20052006$FPED %>%
    select(SEQN, SDMVPSU, SDMVSTRA, WTDRD1)
## inner join with the NHANES 2005-2006 day 1 FPED data
HEI2020_NHANES_0506_d1 = inner_join(NHANES_20052006_FPED, HEI2020_NHANES_0506_d1, by = "SEQN")


# combine all the HEI2020 day 1 data by rows
HEI2020_NHANES_d1 = rbind(HEI2020_NHANES_0506_d1, HEI2020_NHANES_0708_d1, HEI2020_NHANES_0910_d1, HEI2020_NHANES_1112_d1, HEI2020_NHANES_1314_d1, HEI2020_NHANES_1516_d1, HEI2020_NHANES_1718_d1)
# add toddler column
HEI2020_NHANES_d1_2 = HEI2020_NHANES_d1 %>%
    mutate(toddler = ifelse(RIDAGEYR < 2, "toddler", "nontoddler")) %>%
    ## rearrange the column order
    select(SEQN, SDMVPSU, SDMVSTRA, WTDRD1, year, toddler, everything())

# create the survey design for HEI2020 day 1
HEI2020_NHANES_design_d1 = svydesign(
    id = ~SDMVPSU, 
    strata = ~SDMVSTRA, 
    weight = ~WTDRD1, 
    data = HEI2020_NHANES_d1_2, #set up survey design on the full dataset #can restrict at time of analysis 
    nest = TRUE)

Case study 1 plot

Case study 2 plot

## Joining with `by = join_by(year, toddler)`

Case study 3 plot