9 Reconcile Demographic Variables

10 Get Year1 Race Variables

fpath     <- "DATA-PREP/01-year-one/02-data-intermediate/"
fname     <- "wave-01-data-intermediate.csv"
year1_raw <- readr::read_csv( paste0( fpath, fname ) )

# Wrangle Data for Year 1 
# Responses on CEO race and Gender

year1_ceo <- year1_raw %>% 
  dplyr::select(EIN, CEOrace, CEOgender ) %>% 
  dplyr::filter(  ! CEOrace %in% c(-99, NA),
                  ! CEOgender %in% c(-99, NA)  ) %>% 
  dplyr::mutate(
    CEOrace = dplyr::case_match(
      CEOrace,
      1 ~ "AAPI", 2 ~ "Black", 3 ~ "Hisp", 4 ~ "NativeAm", 5 ~ "White", 6 ~ "Bi", 7 ~ "Oth",
      .default = "Oth"
    ),
    CEOgender = dplyr::case_match(
      CEOgender,
      1 ~ "Man", 2 ~ "Woman", 3 ~ "Trans", 4 ~ "NB", 5 ~ "Oth",
      .default = "Oth"
    ),
  race_check = "Yes",
  gender_check = "Yes"
  ) %>% 
  tidyr::pivot_wider(
    names_from = CEOrace,
    names_glue = "CEOrace_{CEOrace}",
    values_from = race_check, 
    values_fill = "No"
  ) %>% 
  tidyr::pivot_wider(
    names_from = CEOgender,
    names_glue = "CEOgender_{CEOgender}",
    values_from = gender_check, 
    values_fill = "No"
  )

# Wrangle Data for Year 1 Responses on Board Chair Race and Gender
year1_board <- year1_raw %>% 
  dplyr::select(EIN, BCrace, BCgender) %>% 
  dplyr::filter(  ! BCrace %in% c(-99, NA),
                  ! BCgender %in% c(-99, NA)) %>% 
  dplyr::mutate(
    BCrace = dplyr::case_match(
      BCrace,
      1 ~ "AAPI", 2 ~ "Black", 3 ~ "Hisp", 4 ~ "NativeAm", 5 ~ "White", 6 ~ "Bi", 7 ~ "Oth",
      .default = "Oth"
    ),
    BCgender = dplyr::case_match(
      BCgender,
      1 ~ "Man", 2 ~ "Woman", 3 ~ "Trans", 4 ~ "NB", 5 ~ "Oth",
      .default = "Oth"
    ),
  race_check = "Yes",
  gender_check = "Yes"
  ) %>% 
  tidyr::pivot_wider(
    names_from = BCrace,
    names_glue = "BChairrace_{BCrace}",
    values_from = race_check, 
    values_fill = "No"
  ) %>% 
  tidyr::pivot_wider(
    names_from = BCgender,
    names_glue = "BChairgender_{BCgender}",
    values_from = gender_check, 
    values_fill = "No"
  )

df.race2021 <- merge( year1_ceo, year1_board, by="EIN", all=TRUE )

11 Get Year2 Race & Gender Variables

fpath <- "DATA-PREP/02-year-two/03-data-final/"
fname <- "YEAR-02-DATA-RESTRICTED-V02.csv"
year2_final <- read.csv( paste0( fpath, fname ) )

# ALL RACE AND GENDER CATEGORIES 
race.vars <- 
c("CEOrace_AAPI", "CEOrace_Black", "CEOrace_Hisp", "CEOrace_NativeAm", 
"CEOrace_White", "CEOrace_Oth", "CEOrace_Text", "CEOgender_Man", 
"CEOgender_Woman", "CEOgender_Trans", "CEOgender_NB", "CEOgender_Oth", 
"CEOgender_specify", "BChairrace_AAPI", "BChairrace_Black", "BChairrace_Hisp", 
"BChairrace_NativeAm", "BChairrace_White", "BChairrace_Oth", 
"BChairrace_Text", "BChairgender_Man", "BChairgender_Woman", 
"BChairgender_Trans", "BChairgender_NB", "BChairgender_Oth", 
"BChairgender_specify")


# select all race variables plus EIN for merge 
df.race2022 <- year2_final[ c("EIN",race.vars) ]

dupes2 <- df.race2022 %>% group_by(EIN) %>% filter( n()>1 ) %>% ungroup()

12 Use Year1 Values if No Year2 Response

# drop 2021 values if there are 2022 versions
df.race2021$EIN <- as.numeric( df.race2021$EIN )
df.race2022$EIN <- as.numeric( df.race2022$EIN )
drop <- intersect( df.race2021$EIN, df.race2022$EIN )

# drop %>% length()  531
# nrow( df.race2021 )  1656
# nrow( df.race2022 )  691
#
# total nrow:  1656 + 691 - 531 - 17 (dupes) = 1799

df.race2021 <- 
  df.race2021 %>% 
  dplyr::filter( ! EIN %in% drop )

df.race2022 <- dplyr::filter( df.race2022, ! is.na(EIN) )
df.race2022 <- dplyr::bind_rows( df.race2022, df.race2021 )

# nrow( df.race2022 )  1799

13 Merge 2023 and 2022 Versions

# creates race.vars.y versions:
# CEOrace_AAPI & CEOrace_AAPI.y 

survey_df <- 
  merge( survey_df, df.race2022, 
         by.x="ExternalReference", by.y="EIN", 
         all.x=TRUE )

# remove .x from varname.x versions:

nm <- names( survey_df )
nm <- gsub( "\\.x$", "", nm )
names( survey_df ) <- nm 


###
###   CEO UPDATES
###

# APPLY TO COLUMNS K:
COLUMNS <-  
  c("CEOrace_AAPI", "CEOrace_Black", 
    "CEOrace_Hisp", "CEOrace_NativeAm", 
    "CEOrace_White", 
    "CEOrace_Oth", "CEOrace_Text",
    "CEOgender_Man", "CEOgender_Woman", 
    "CEOgender_Trans", "CEOgender_NB", 
    "CEOgender_Oth", "CEOgender_specify" )

# ALL RESPONSES NA IN GROUP:
# indication that the question
# was skipped by the respondent

all.na <- apply( survey_df[ COLUMNS ], 
                 MARGIN = 1, 
                 FUN = function(x) all(is.na(x)) ) 

CHANGE_2023_X <- ! all.na

# NA means "no" if change occurred

na_to_no <- function(x){
  x[ is.na(x) ] <- "No"
  return(x)
}

survey_df[ COLUMNS ] <- 
  survey_df[ COLUMNS ] %>% 
  lapply( na_to_no )

# if they do not respond "1/yes" 
# to at least one question then
# assume they skipped them all

no_to_na <- function( x, rowid ){
  x[ rowid ] <- NA
  return(x)
}

survey_df[ COLUMNS ] <- 
  survey_df[ COLUMNS ] %>% 
  lapply( no_to_na, rowid=all.na )



# update 2022 data with 2023 responses: 
#   has.change is T/F indicator of any CEO
#   change in 2023

impute_race <- function( x.txt, df, has.change ){
  x.txt.y <- paste0( x.txt, ".y" )
  
  x1 <- df[[ x.txt   ]] # race indicator 2023 1/NA
  x2 <- df[[ x.txt.y ]] # race indicator 2022 Yes/No
  
  # update 2022 values with 2023 values
  x1[ is.na(x1) ] <- ""
  x2[ has.change ] <- x1[ has.change ]
  
  # return updated vector
  return( x2 )
}

survey_df[ COLUMNS ] <- 
  purrr::map( COLUMNS, impute_race, survey_df, CHANGE_2023_X )



###
###   BOARD UPDATES
###

# APPLY TO COLUMNS K:
COLUMNS <-  
  c("BChairrace_AAPI", "BChairrace_Black", 
    "BChairrace_Hisp", "BChairrace_NativeAm", 
    "BChairrace_White", 
    "BChairrace_Oth", "BChairrace_Text", 
    "BChairgender_Man", 
    "BChairgender_Woman", "BChairgender_Trans", 
    "BChairgender_NB", 
    "BChairgender_Oth", "BChairgender_specify" )

# ALL RESPONSES NA IN GROUP:
# indication that the question
# was skipped by the respondent

all.na <- apply( survey_df[ COLUMNS ], 
                 MARGIN = 1, 
                 FUN = function(x) all(is.na(x)) ) 

# any change in 2023
CHANGE_2023_X <- ! all.na

# NA means "no"

na_to_no <- function(x){
  x[ is.na(x) ] <- "No"
  return(x)
}

survey_df[ COLUMNS ] <- 
  survey_df[ COLUMNS ] %>% 
  lapply( na_to_no )

# if they do not respond "1/yes" 
# to at least one question then
# assume they skipped them all

no_to_na <- function( x, rowid ){
  x[ rowid ] <- NA
  return(x)
}

survey_df[ COLUMNS ] <- 
  survey_df[ COLUMNS ] %>% 
  lapply( no_to_na, rowid=all.na )



# update 2022 data with 2023 responses 

impute_race <- function( x.txt, df, has.change ){
  x.txt.y <- paste0( x.txt, ".y" )
  
  x1 <- df[[ x.txt   ]] # race indicator 2023 1/NA
  x2 <- df[[ x.txt.y ]] # race indicator 2022 Yes/No
  
  # update 2022 values with 2023 values
  x1[ is.na(x1) ] <- ""
  x2[ has.change ] <- x1[ has.change ]
  
  # return updated vector
  return( x2 )
}

survey_df[ COLUMNS ] <- 
  purrr::map( COLUMNS, impute_race, survey_df, CHANGE_2023_X )



# change varname.y to varname.2022

nm <- names( survey_df )
nm <- gsub( "\\.y$", "_v2022", nm )
names( survey_df ) <- nm

14 Create Biracial Categories in Year 3

Create a Biracial indicator variable in the Year 3 data if a CEO or Board Chair belongs to 2 or more racial groups.

# CEOs

ceo.race.qns <- grep( "CEOrace", race_gender_qns_bool, value=T  )

# COUNT YES'S
num.races.ceo <-  
  apply( survey_df[ ceo.race.qns ], 
         MARGIN = 1, 
         FUN = function(x){ sum( x %in% c("1","Yes"), na.rm=T ) } ) 

survey_df$CEOrace_Bi <- ifelse( num.races.ceo > 1, "Yes","No" )

# OMIT ROWS WITH NO RESPONSE 

is_missing <- function(x){
  # can be NA or ""
  x[ is.na(x) ] <- ""
  return( x == "" )
}

all.missing <- 
  apply( survey_df[ ceo.race.qns ], 
         MARGIN = 1, 
         FUN = function(x){ all( is_missing(x) ) } ) 

survey_df$CEOrace_Bi[ all.missing ] <- ""

# BOARD CHAIRs

board.race.qns <- grep( "BChairrace", race_gender_qns_bool, value=T )

# COUNT YES
num.races.chair <-  
  apply( survey_df[ board.race.qns ], 
          MARGIN = 1, 
          FUN = function(x){ sum( x %in% c("1","Yes"), na.rm=T ) } ) 

survey_df$BChairrace_Bi <- ifelse( num.races.chair > 1, "Yes","No" )

# OMIT ROWS WITH NO RESPONSE 
all.missing <- 
  apply( survey_df[ board.race.qns  ], 
         MARGIN = 1, 
         FUN = function(x){ all( is_missing(x) ) } )  

survey_df$BChairrace_Bi[ all.missing ] <- ""


# MAKE MEMISC ITEM 

biracial_qns <- c( "CEOrace_Bi","BChairrace_Bi" )

# Create Survey Item
survey_df <- create_survey_item(
  survey_df, 
  biracial_qns, 
  recode_vals = c("Yes","No", "" ), 
  recode_labs = c("Yes","No", "Missing" ), 
  missing_vals = ""
)

Example:

BChairrace_Bi

Storage mode:	character
Measurement:	nominal
Missing values:

Values and labels		N	Valid			Total
M	‘Missing’	94				18	.	7
No	‘No’	401	98	.	3	79	.	9
Yes	‘Yes’	7	1	.	7	1	.	4

15 Create Combined Race Field

While our race and gender columns are individual binary columns, this is not the case for year 1 and 3 results. Both those years contain single columns for race and gender variables for CEOs and Board Chairs respectively. For easier comparability, we create a new variable aggregating race and gender values for all our individual binary columns.

15.1 Race and Gender Checkboxes

These checkboxes are ticked by the respondent to indicate if their CEO or board chair belong to a specified race or gender identity.

Original Value	Description	Recode Label	Recode Value	Code as Missing ?
Asian/Pacific Islander	Checkbox Checked	Yes	1	No
Black/African American	Checkbox Checked	Yes	1	No
Latinx/Hispanic	Checkbox Checked	Yes	1	No
Native American/American Indian	Checkbox Checked	Yes	1	No
White	Checkbox Checked	Yes	1	No
Man	Checkbox Checked	Yes	1	No
Woman	Checkbox Checked	Yes	1	No
Trans	Checkbox Checked	Yes	1	No
Gender non-conforming/Non-Binary	Checkbox Checked	Yes	1	No
Other (please specify)	Checkbox Checked	Yes	1	No
0	Checkbox Unchecked	No	0	No
NA	Unanswered	NA	NA	Yes

# APPLY TO COLUMNS K:
COLUMNS <-  race_gender_qns_bool


# VALUES THAT NEED RECODING
 
RULES <- c(     "    1    =>>      Yes     ", 
                "  Yes    =>>      Yes     ", 
                "   No    =>>       No     ",
                "   NA    =>>      _NA_    "   )


rules <- parse_rules( RULES )
pattern <- rules[[ "pattern" ]]
replace <- rules[[ "replace" ]]

# MEMISC LABELS AND MISSING VALUE CODES
values  <- c( "No", "Yes", "", "_NA_" )
labels  <- c( "No", "Yes", "Missing in 2022", "No Response" )
missing <- c( "", "_NA_" )

# RECODE VARIABLES
survey_df <-
  survey_df %>%
  recode_columns( k=COLUMNS, pattern, replace, values, labels, missing )

Example:

CEOrace_White — ‘TYPE: boolean’

“Race of CEO”

Storage mode:	character
Measurement:	nominal
Missing values:	, NA

Values and labels			N	Valid			Total
M		‘Missing in 2022’	59				11	.	8
NA	M	‘No Response’	54				10	.	8
No		‘No’	42	10	.	8	8	.	4
Yes		‘Yes’	347	89	.	2	69	.	1

QUESTION TXT:: Which of the following best describes the race/ethnicity for your organization’s current Chief Executive (i.e., Executive Director or CEO)? (Select all that apply) - Selected Choice - White

15.2 Aggregated Race Variable

Create a single categorical field for Race.

Original Value	Description	Recode Label	Recode Value	Code as Missing ?
‘Yes’ in CEOrace_AAPI	Asian/Pacific Islander	AAPI	1	No
‘Yes’ in CEOrace_Black	Black/African American	Black	2	No
‘Yes’ in CEOrace_Hisp	Latinx/Hispanic	Hispanic	3	No
‘Yes’ in CEOrace_NativeAm	Native American/American Indian	NativeAm	4	No
‘Yes’ in CEOrace_White	White	White	5	No
‘Yes’ in CEOrace_Oth	Other (please specify)	Other	6	No
‘Yes’ in CEOrace_Bi	Bi/Multi-racial	Biracial	7	No
No Reponse	No race reported	NA	NA	Yes

# Create New Race variables
survey_df <- survey_df %>%
  dplyr::mutate(
    CEOrace = dplyr::case_when(
      CEOrace_AAPI == 'Yes' ~ 1,
      CEOrace_Black == 'Yes' ~ 2,
      CEOrace_Hisp == 'Yes' ~ 3,
      CEOrace_NativeAm == 'Yes' ~ 4,
      CEOrace_White == 'Yes' ~ 5,
      CEOrace_Oth == 'Yes' ~ 6,
      CEOrace_Bi == 'Yes' ~ 7,
      .default = NA
    ),
    BChairrace = dplyr::case_when(
      BChairrace_AAPI == 'Yes' ~ 1,
      BChairrace_Black == 'Yes' ~ 2,
      BChairrace_Hisp == 'Yes' ~ 3,
      BChairrace_NativeAm == 'Yes' ~ 4,
      BChairrace_White == 'Yes' ~ 5,
      BChairrace_Oth == 'Yes' ~ 6,
      BChairrace_Bi == 'Yes' ~ 7,
      .default = NA
  ))

# Manage any coding conflicts: 
survey_df$CEOrace[ survey_df$CEOrace_Bi == "Yes" ]       <- 7
survey_df$BChairrace[ survey_df$BChairrace_Bi == "Yes" ] <- 7

# APPLY TO COLUMNS K:
race.categories <- c("CEOrace", "BChairrace")
COLUMNS <- race.categories

# VALUES THAT NEED RECODING
 
RULES <- c(   "   NA    =>>    _NA_    "   )

rules <- parse_rules( RULES )
pattern <- rules[[ "pattern" ]]
replace <- rules[[ "replace" ]]

# MEMISC LABELS AND MISSING VALUE CODES
values  <- c( 1, 2, 3, 4, 5, 6, 7, "_NA_" )
labels  <- c( "AAPI", "Black", "Hispanic", "NativeAm", 
              "White", "Other","Biracial", "No Response" )
missing <- "_NA_"


dd.add <- 
  data.frame( vname=c("CEOrace", "BChairrace"),
              vlabel=c("Race of the CEO","Race of the Board Chair"),
              type=c("Categorical","Categorical"),
              description=c("Race categories combined into a single factor") )

dd <- dplyr::bind_rows( dd, dd.add )

# RECODE VARIABLES
survey_df <-
  survey_df %>%
  recode_columns( k=COLUMNS, pattern, replace, values, labels, missing )

Example:

CEOrace — ‘TYPE: Categorical’

“Race of the CEO”

Storage mode:	character
Measurement:	nominal
Missing values:	NA

Values and labels			N	Valid			Total
NA	M	‘No Response’	115				22	.	9
1		‘AAPI’	5	1	.	3	1	.	0
2		‘Black’	20	5	.	2	4	.	0
3		‘Hispanic’	6	1	.	6	1	.	2
4		‘NativeAm’	3	0	.	8	0	.	6
5		‘White’	344	88	.	9	68	.	5
6		‘Other’	6	1	.	6	1	.	2
7		‘Biracial’	3	0	.	8	0	.	6

QUESTION TXT:: Race categories combined into a single factor

16 Create Combined Gender Field

Original Value	Description	Recode Label	Recode Value	Code as Missing ?
‘Yes’ in CEOgender_Man	Man	Man	1	No
‘Yes’ in CEOgender_Woman	Woman	Woman	2	No
‘Yes’ in CEOgender_Trans	Trans	Trans	3	No
‘Yes’ in CEOgender_NB	Gender non-conforming/Non-Binary	NB	4	No
‘Yes’ in CEOgender_Oth	Other (please specify)	Other	5	No
No response	No gender reported	NA	NA	Yes

survey_df <- survey_df %>%
  dplyr::mutate(
    CEOgender = dplyr::case_when(
      CEOgender_Man == 'Yes' ~ 1,
      CEOgender_Woman == 'Yes' ~ 2,
      CEOgender_Trans == 'Yes' ~ 3,
      CEOgender_NB == 'Yes' ~ 4,
      CEOgender_Oth == 'Yes' ~ 5,
      .default = NA
    ),
    BChairgender = dplyr::case_when(
      BChairgender_Man == 'Yes' ~ 1,
      BChairgender_Woman == 'Yes' ~ 2,
      BChairgender_Trans == 'Yes' ~ 3,
      BChairgender_NB == 'Yes' ~ 4,
      BChairgender_Oth == 'Yes' ~ 5,
      .default = NA
  ))

# APPLY TO COLUMNS K:
COLUMNS <- c("CEOgender", "BChairgender")

# VALUES THAT NEED RECODING
RULES <- c(   "   NA    =>>    _NA_    "   )
rules <- parse_rules( RULES )
pattern <- rules[[ "pattern" ]]
replace <- rules[[ "replace" ]]

# MEMISC LABELS AND MISSING VALUE CODES
values  <- c( 1, 2, 3, 4, 5, "_NA_" )
labels  <- c( "Man", "Woman", "Trans", "NB", "Other", "No Response" )
missing <- "_NA_"


dd.add <- 
  data.frame( vname=c("CEOgender", "BChairgender"),
              vlabel=c("Race of the CEO","Race of the Board Chair"),
              type=c("Categorical","Categorical"),
              description=c("Race categories combined into a single factor") )

dd <- dplyr::bind_rows( dd, dd.add )

# RECODE VARIABLES
survey_df <-
  survey_df %>%
  recode_columns( k=COLUMNS, pattern, replace, values, labels, missing )

Example:

CEOgender — ‘TYPE: Categorical’

“Race of the CEO”

Storage mode:	character
Measurement:	nominal
Missing values:	NA

Values and labels			N	Valid			Total
NA	M	‘No Response’	113				22	.	5
1		‘Man’	67	17	.	2	13	.	3
2		‘Woman’	316	81	.	2	62	.	9
3		‘Trans’	1	0	.	3	0	.	2
4		‘NB’	3	0	.	8	0	.	6
5		‘Other’	2	0	.	5	0	.	4

QUESTION TXT:: Race categories combined into a single factor

16.1 Save outputs

fpath <- "DATA-PREP/03-year-three/03-data-final/"
fname <- "YEAR-03-DATA-RESTRICTED-V05.csv"
write.csv( survey_df, paste0( fpath, fname ), row.names=F, na="" )