9  Reconcile Demographic Variables

10 Get Year1 Race Variables

fpath     <- "DATA-PREP/01-year-one/02-data-intermediate/"
fname     <- "wave-01-data-intermediate.csv"
year1_raw <- readr::read_csv( paste0( fpath, fname ) )
# Wrangle Data for Year 1 
# Responses on CEO race and Gender

year1_ceo <- year1_raw %>% 
  dplyr::select(EIN, CEOrace, CEOgender ) %>% 
  dplyr::filter(  ! CEOrace %in% c(-99, NA),
                  ! CEOgender %in% c(-99, NA)  ) %>% 
  dplyr::mutate(
    CEOrace = dplyr::case_match(
      CEOrace,
      1 ~ "AAPI", 2 ~ "Black", 3 ~ "Hisp", 4 ~ "NativeAm", 5 ~ "White", 6 ~ "Bi", 7 ~ "Oth",
      .default = "Oth"
    ),
    CEOgender = dplyr::case_match(
      CEOgender,
      1 ~ "Man", 2 ~ "Woman", 3 ~ "Trans", 4 ~ "NB", 5 ~ "Oth",
      .default = "Oth"
    ),
  race_check = "Yes",
  gender_check = "Yes"
  ) %>% 
  tidyr::pivot_wider(
    names_from = CEOrace,
    names_glue = "CEOrace_{CEOrace}",
    values_from = race_check, 
    values_fill = "No"
  ) %>% 
  tidyr::pivot_wider(
    names_from = CEOgender,
    names_glue = "CEOgender_{CEOgender}",
    values_from = gender_check, 
    values_fill = "No"
  )

# Wrangle Data for Year 1 Responses on Board Chair Race and Gender
year1_board <- year1_raw %>% 
  dplyr::select(EIN, BCrace, BCgender) %>% 
  dplyr::filter(  ! BCrace %in% c(-99, NA),
                  ! BCgender %in% c(-99, NA)) %>% 
  dplyr::mutate(
    BCrace = dplyr::case_match(
      BCrace,
      1 ~ "AAPI", 2 ~ "Black", 3 ~ "Hisp", 4 ~ "NativeAm", 5 ~ "White", 6 ~ "Bi", 7 ~ "Oth",
      .default = "Oth"
    ),
    BCgender = dplyr::case_match(
      BCgender,
      1 ~ "Man", 2 ~ "Woman", 3 ~ "Trans", 4 ~ "NB", 5 ~ "Oth",
      .default = "Oth"
    ),
  race_check = "Yes",
  gender_check = "Yes"
  ) %>% 
  tidyr::pivot_wider(
    names_from = BCrace,
    names_glue = "BChairrace_{BCrace}",
    values_from = race_check, 
    values_fill = "No"
  ) %>% 
  tidyr::pivot_wider(
    names_from = BCgender,
    names_glue = "BChairgender_{BCgender}",
    values_from = gender_check, 
    values_fill = "No"
  )

df.race2021 <- merge( year1_ceo, year1_board, by="EIN", all=TRUE )

11 Get Year2 Race & Gender Variables

fpath <- "DATA-PREP/02-year-two/03-data-final/"
fname <- "YEAR-02-DATA-RESTRICTED-V02.csv"
year2_final <- read.csv( paste0( fpath, fname ) )

# ALL RACE AND GENDER CATEGORIES 
race.vars <- 
c("CEOrace_AAPI", "CEOrace_Black", "CEOrace_Hisp", "CEOrace_NativeAm", 
"CEOrace_White", "CEOrace_Oth", "CEOrace_Text", "CEOgender_Man", 
"CEOgender_Woman", "CEOgender_Trans", "CEOgender_NB", "CEOgender_Oth", 
"CEOgender_specify", "BChairrace_AAPI", "BChairrace_Black", "BChairrace_Hisp", 
"BChairrace_NativeAm", "BChairrace_White", "BChairrace_Oth", 
"BChairrace_Text", "BChairgender_Man", "BChairgender_Woman", 
"BChairgender_Trans", "BChairgender_NB", "BChairgender_Oth", 
"BChairgender_specify")


# select all race variables plus EIN for merge 
df.race2022 <- year2_final[ c("EIN",race.vars) ] 
dupes2 <- df.race2022 %>% group_by(EIN) %>% filter( n()>1 ) %>% ungroup() 

12 Use Year1 Values if No Year2 Response

# drop 2021 values if there are 2022 versions
df.race2021$EIN <- as.numeric( df.race2021$EIN )
df.race2022$EIN <- as.numeric( df.race2022$EIN )
drop <- intersect( df.race2021$EIN, df.race2022$EIN )

# drop %>% length()  531
# nrow( df.race2021 )  1656
# nrow( df.race2022 )  691
#
# total nrow:  1656 + 691 - 531 - 17 (dupes) = 1799

df.race2021 <- 
  df.race2021 %>% 
  dplyr::filter( ! EIN %in% drop )

df.race2022 <- dplyr::filter( df.race2022, ! is.na(EIN) )
df.race2022 <- dplyr::bind_rows( df.race2022, df.race2021 )

# nrow( df.race2022 )  1799

13 Merge 2023 and 2022 Versions

# creates race.vars.y versions:
# CEOrace_AAPI & CEOrace_AAPI.y 

survey_df <- 
  merge( survey_df, df.race2022, 
         by.x="ExternalReference", by.y="EIN", 
         all.x=TRUE )

# remove .x from varname.x versions:

nm <- names( survey_df )
nm <- gsub( "\\.x$", "", nm )
names( survey_df ) <- nm 


###
###   CEO UPDATES
###

# APPLY TO COLUMNS K:
COLUMNS <-  
  c("CEOrace_AAPI", "CEOrace_Black", 
    "CEOrace_Hisp", "CEOrace_NativeAm", 
    "CEOrace_White", 
    "CEOrace_Oth", "CEOrace_Text",
    "CEOgender_Man", "CEOgender_Woman", 
    "CEOgender_Trans", "CEOgender_NB", 
    "CEOgender_Oth", "CEOgender_specify" )

# ALL RESPONSES NA IN GROUP:
# indication that the question
# was skipped by the respondent

all.na <- apply( survey_df[ COLUMNS ], 
                 MARGIN = 1, 
                 FUN = function(x) all(is.na(x)) ) 

CHANGE_2023_X <- ! all.na

# NA means "no" if change occurred

na_to_no <- function(x){
  x[ is.na(x) ] <- "No"
  return(x)
}

survey_df[ COLUMNS ] <- 
  survey_df[ COLUMNS ] %>% 
  lapply( na_to_no )

# if they do not respond "1/yes" 
# to at least one question then
# assume they skipped them all

no_to_na <- function( x, rowid ){
  x[ rowid ] <- NA
  return(x)
}

survey_df[ COLUMNS ] <- 
  survey_df[ COLUMNS ] %>% 
  lapply( no_to_na, rowid=all.na )



# update 2022 data with 2023 responses: 
#   has.change is T/F indicator of any CEO
#   change in 2023

impute_race <- function( x.txt, df, has.change ){
  x.txt.y <- paste0( x.txt, ".y" )
  
  x1 <- df[[ x.txt   ]] # race indicator 2023 1/NA
  x2 <- df[[ x.txt.y ]] # race indicator 2022 Yes/No
  
  # update 2022 values with 2023 values
  x1[ is.na(x1) ] <- ""
  x2[ has.change ] <- x1[ has.change ]
  
  # return updated vector
  return( x2 )
}

survey_df[ COLUMNS ] <- 
  purrr::map( COLUMNS, impute_race, survey_df, CHANGE_2023_X )



###
###   BOARD UPDATES
###

# APPLY TO COLUMNS K:
COLUMNS <-  
  c("BChairrace_AAPI", "BChairrace_Black", 
    "BChairrace_Hisp", "BChairrace_NativeAm", 
    "BChairrace_White", 
    "BChairrace_Oth", "BChairrace_Text", 
    "BChairgender_Man", 
    "BChairgender_Woman", "BChairgender_Trans", 
    "BChairgender_NB", 
    "BChairgender_Oth", "BChairgender_specify" )

# ALL RESPONSES NA IN GROUP:
# indication that the question
# was skipped by the respondent

all.na <- apply( survey_df[ COLUMNS ], 
                 MARGIN = 1, 
                 FUN = function(x) all(is.na(x)) ) 

# any change in 2023
CHANGE_2023_X <- ! all.na

# NA means "no"

na_to_no <- function(x){
  x[ is.na(x) ] <- "No"
  return(x)
}

survey_df[ COLUMNS ] <- 
  survey_df[ COLUMNS ] %>% 
  lapply( na_to_no )

# if they do not respond "1/yes" 
# to at least one question then
# assume they skipped them all

no_to_na <- function( x, rowid ){
  x[ rowid ] <- NA
  return(x)
}

survey_df[ COLUMNS ] <- 
  survey_df[ COLUMNS ] %>% 
  lapply( no_to_na, rowid=all.na )



# update 2022 data with 2023 responses 

impute_race <- function( x.txt, df, has.change ){
  x.txt.y <- paste0( x.txt, ".y" )
  
  x1 <- df[[ x.txt   ]] # race indicator 2023 1/NA
  x2 <- df[[ x.txt.y ]] # race indicator 2022 Yes/No
  
  # update 2022 values with 2023 values
  x1[ is.na(x1) ] <- ""
  x2[ has.change ] <- x1[ has.change ]
  
  # return updated vector
  return( x2 )
}

survey_df[ COLUMNS ] <- 
  purrr::map( COLUMNS, impute_race, survey_df, CHANGE_2023_X )



# change varname.y to varname.2022

nm <- names( survey_df )
nm <- gsub( "\\.y$", "_v2022", nm )
names( survey_df ) <- nm 

14 Create Biracial Categories in Year 3

Create a Biracial indicator variable in the Year 3 data if a CEO or Board Chair belongs to 2 or more racial groups.

# CEOs

ceo.race.qns <- grep( "CEOrace", race_gender_qns_bool, value=T  )

# COUNT YES'S
num.races.ceo <-  
  apply( survey_df[ ceo.race.qns ], 
         MARGIN = 1, 
         FUN = function(x){ sum( x %in% c("1","Yes"), na.rm=T ) } ) 

survey_df$CEOrace_Bi <- ifelse( num.races.ceo > 1, "Yes","No" )

# OMIT ROWS WITH NO RESPONSE 

is_missing <- function(x){
  # can be NA or ""
  x[ is.na(x) ] <- ""
  return( x == "" )
}

all.missing <- 
  apply( survey_df[ ceo.race.qns ], 
         MARGIN = 1, 
         FUN = function(x){ all( is_missing(x) ) } ) 

survey_df$CEOrace_Bi[ all.missing ] <- ""

# BOARD CHAIRs

board.race.qns <- grep( "BChairrace", race_gender_qns_bool, value=T )

# COUNT YES
num.races.chair <-  
  apply( survey_df[ board.race.qns ], 
          MARGIN = 1, 
          FUN = function(x){ sum( x %in% c("1","Yes"), na.rm=T ) } ) 

survey_df$BChairrace_Bi <- ifelse( num.races.chair > 1, "Yes","No" )

# OMIT ROWS WITH NO RESPONSE 
all.missing <- 
  apply( survey_df[ board.race.qns  ], 
         MARGIN = 1, 
         FUN = function(x){ all( is_missing(x) ) } )  

survey_df$BChairrace_Bi[ all.missing ] <- ""


# MAKE MEMISC ITEM 

biracial_qns <- c( "CEOrace_Bi","BChairrace_Bi" )

# Create Survey Item
survey_df <- create_survey_item(
  survey_df, 
  biracial_qns, 
  recode_vals = c("Yes","No", "" ), 
  recode_labs = c("Yes","No", "Missing" ), 
  missing_vals = ""
)

Example:

BChairrace_Bi

Storage mode: character
Measurement: nominal
Missing values:

Values and labels N Valid Total
M ‘Missing’ 94 18 . 7
No ‘No’ 401 98 . 3 79 . 9
Yes ‘Yes’ 7 1 . 7 1 . 4

15 Create Combined Race Field

While our race and gender columns are individual binary columns, this is not the case for year 1 and 3 results. Both those years contain single columns for race and gender variables for CEOs and Board Chairs respectively. For easier comparability, we create a new variable aggregating race and gender values for all our individual binary columns.

15.1 Race and Gender Checkboxes

These checkboxes are ticked by the respondent to indicate if their CEO or board chair belong to a specified race or gender identity.

Original Value Description Recode Label Recode Value Code as Missing ?
Asian/Pacific Islander Checkbox Checked Yes 1 No
Black/African American Checkbox Checked Yes 1 No
Latinx/Hispanic Checkbox Checked Yes 1 No
Native American/American Indian Checkbox Checked Yes 1 No
White Checkbox Checked Yes 1 No
Man Checkbox Checked Yes 1 No
Woman Checkbox Checked Yes 1 No
Trans Checkbox Checked Yes 1 No
Gender non-conforming/Non-Binary Checkbox Checked Yes 1 No
Other (please specify) Checkbox Checked Yes 1 No
0 Checkbox Unchecked No 0 No
NA Unanswered NA NA Yes
# APPLY TO COLUMNS K:
COLUMNS <-  race_gender_qns_bool


# VALUES THAT NEED RECODING
 
RULES <- c(     "    1    =>>      Yes     ", 
                "  Yes    =>>      Yes     ", 
                "   No    =>>       No     ",
                "   NA    =>>      _NA_    "   )


rules <- parse_rules( RULES )
pattern <- rules[[ "pattern" ]]
replace <- rules[[ "replace" ]]

# MEMISC LABELS AND MISSING VALUE CODES
values  <- c( "No", "Yes", "", "_NA_" )
labels  <- c( "No", "Yes", "Missing in 2022", "No Response" )
missing <- c( "", "_NA_" )

# RECODE VARIABLES
survey_df <-
  survey_df %>%
  recode_columns( k=COLUMNS, pattern, replace, values, labels, missing )

Example:

CEOrace_White‘TYPE: boolean’

“Race of CEO”

Storage mode: character
Measurement: nominal
Missing values: , NA

Values and labels N Valid Total
M ‘Missing in 2022’ 59 11 . 8
NA M ‘No Response’ 54 10 . 8
No ‘No’ 42 10 . 8 8 . 4
Yes ‘Yes’ 347 89 . 2 69 . 1

QUESTION TXT:
Which of the following best describes the race/ethnicity for your organization’s current Chief Executive (i.e., Executive Director or CEO)? (Select all that apply) - Selected Choice - White

15.2 Aggregated Race Variable

Create a single categorical field for Race.

Original Value Description Recode Label Recode Value Code as Missing ?
‘Yes’ in CEOrace_AAPI Asian/Pacific Islander AAPI 1 No
‘Yes’ in CEOrace_Black Black/African American Black 2 No
‘Yes’ in CEOrace_Hisp Latinx/Hispanic Hispanic 3 No
‘Yes’ in CEOrace_NativeAm Native American/American Indian NativeAm 4 No
‘Yes’ in CEOrace_White White White 5 No
‘Yes’ in CEOrace_Oth Other (please specify) Other 6 No
‘Yes’ in CEOrace_Bi Bi/Multi-racial Biracial 7 No
No Reponse No race reported NA NA Yes
# Create New Race variables
survey_df <- survey_df %>%
  dplyr::mutate(
    CEOrace = dplyr::case_when(
      CEOrace_AAPI == 'Yes' ~ 1,
      CEOrace_Black == 'Yes' ~ 2,
      CEOrace_Hisp == 'Yes' ~ 3,
      CEOrace_NativeAm == 'Yes' ~ 4,
      CEOrace_White == 'Yes' ~ 5,
      CEOrace_Oth == 'Yes' ~ 6,
      CEOrace_Bi == 'Yes' ~ 7,
      .default = NA
    ),
    BChairrace = dplyr::case_when(
      BChairrace_AAPI == 'Yes' ~ 1,
      BChairrace_Black == 'Yes' ~ 2,
      BChairrace_Hisp == 'Yes' ~ 3,
      BChairrace_NativeAm == 'Yes' ~ 4,
      BChairrace_White == 'Yes' ~ 5,
      BChairrace_Oth == 'Yes' ~ 6,
      BChairrace_Bi == 'Yes' ~ 7,
      .default = NA
  ))

# Manage any coding conflicts: 
survey_df$CEOrace[ survey_df$CEOrace_Bi == "Yes" ]       <- 7
survey_df$BChairrace[ survey_df$BChairrace_Bi == "Yes" ] <- 7

# APPLY TO COLUMNS K:
race.categories <- c("CEOrace", "BChairrace")
COLUMNS <- race.categories

# VALUES THAT NEED RECODING
 
RULES <- c(   "   NA    =>>    _NA_    "   )

rules <- parse_rules( RULES )
pattern <- rules[[ "pattern" ]]
replace <- rules[[ "replace" ]]

# MEMISC LABELS AND MISSING VALUE CODES
values  <- c( 1, 2, 3, 4, 5, 6, 7, "_NA_" )
labels  <- c( "AAPI", "Black", "Hispanic", "NativeAm", 
              "White", "Other","Biracial", "No Response" )
missing <- "_NA_"


dd.add <- 
  data.frame( vname=c("CEOrace", "BChairrace"),
              vlabel=c("Race of the CEO","Race of the Board Chair"),
              type=c("Categorical","Categorical"),
              description=c("Race categories combined into a single factor") )

dd <- dplyr::bind_rows( dd, dd.add )

# RECODE VARIABLES
survey_df <-
  survey_df %>%
  recode_columns( k=COLUMNS, pattern, replace, values, labels, missing )

Example:

CEOrace‘TYPE: Categorical’

“Race of the CEO”

Storage mode: character
Measurement: nominal
Missing values: NA

Values and labels N Valid Total
NA M ‘No Response’ 115 22 . 9
1 ‘AAPI’ 5 1 . 3 1 . 0
2 ‘Black’ 20 5 . 2 4 . 0
3 ‘Hispanic’ 6 1 . 6 1 . 2
4 ‘NativeAm’ 3 0 . 8 0 . 6
5 ‘White’ 344 88 . 9 68 . 5
6 ‘Other’ 6 1 . 6 1 . 2
7 ‘Biracial’ 3 0 . 8 0 . 6

QUESTION TXT:
Race categories combined into a single factor

16 Create Combined Gender Field

Original Value Description Recode Label Recode Value Code as Missing ?
‘Yes’ in CEOgender_Man Man Man 1 No
‘Yes’ in CEOgender_Woman Woman Woman 2 No
‘Yes’ in CEOgender_Trans Trans Trans 3 No
‘Yes’ in CEOgender_NB Gender non-conforming/Non-Binary NB 4 No
‘Yes’ in CEOgender_Oth Other (please specify) Other 5 No
No response No gender reported NA NA Yes
survey_df <- survey_df %>%
  dplyr::mutate(
    CEOgender = dplyr::case_when(
      CEOgender_Man == 'Yes' ~ 1,
      CEOgender_Woman == 'Yes' ~ 2,
      CEOgender_Trans == 'Yes' ~ 3,
      CEOgender_NB == 'Yes' ~ 4,
      CEOgender_Oth == 'Yes' ~ 5,
      .default = NA
    ),
    BChairgender = dplyr::case_when(
      BChairgender_Man == 'Yes' ~ 1,
      BChairgender_Woman == 'Yes' ~ 2,
      BChairgender_Trans == 'Yes' ~ 3,
      BChairgender_NB == 'Yes' ~ 4,
      BChairgender_Oth == 'Yes' ~ 5,
      .default = NA
  ))

# APPLY TO COLUMNS K:
COLUMNS <- c("CEOgender", "BChairgender")

# VALUES THAT NEED RECODING
RULES <- c(   "   NA    =>>    _NA_    "   )
rules <- parse_rules( RULES )
pattern <- rules[[ "pattern" ]]
replace <- rules[[ "replace" ]]

# MEMISC LABELS AND MISSING VALUE CODES
values  <- c( 1, 2, 3, 4, 5, "_NA_" )
labels  <- c( "Man", "Woman", "Trans", "NB", "Other", "No Response" )
missing <- "_NA_"


dd.add <- 
  data.frame( vname=c("CEOgender", "BChairgender"),
              vlabel=c("Race of the CEO","Race of the Board Chair"),
              type=c("Categorical","Categorical"),
              description=c("Race categories combined into a single factor") )

dd <- dplyr::bind_rows( dd, dd.add )

# RECODE VARIABLES
survey_df <-
  survey_df %>%
  recode_columns( k=COLUMNS, pattern, replace, values, labels, missing )

Example:

CEOgender‘TYPE: Categorical’

“Race of the CEO”

Storage mode: character
Measurement: nominal
Missing values: NA

Values and labels N Valid Total
NA M ‘No Response’ 113 22 . 5
1 ‘Man’ 67 17 . 2 13 . 3
2 ‘Woman’ 316 81 . 2 62 . 9
3 ‘Trans’ 1 0 . 3 0 . 2
4 ‘NB’ 3 0 . 8 0 . 6
5 ‘Other’ 2 0 . 5 0 . 4

QUESTION TXT:
Race categories combined into a single factor

16.1 Save outputs

fpath <- "DATA-PREP/03-year-three/03-data-final/"
fname <- "YEAR-03-DATA-RESTRICTED-V04.csv"
write.csv( survey_df, paste0( fpath, fname ), row.names=F, na="" )