fpath <- "DATA-PREP/01-year-one/02-data-intermediate/"
fname <- "wave-01-data-intermediate.csv"
year1_raw <- readr::read_csv( paste0( fpath, fname ) )9 Reconcile Demographic Variables
10 Get Year1 Race Variables
# Wrangle Data for Year 1
# Responses on CEO race and Gender
year1_ceo <- year1_raw %>%
dplyr::select(EIN, CEOrace, CEOgender ) %>%
dplyr::filter( ! CEOrace %in% c(-99, NA),
! CEOgender %in% c(-99, NA) ) %>%
dplyr::mutate(
CEOrace = dplyr::case_match(
CEOrace,
1 ~ "AAPI", 2 ~ "Black", 3 ~ "Hisp", 4 ~ "NativeAm", 5 ~ "White", 6 ~ "Bi", 7 ~ "Oth",
.default = "Oth"
),
CEOgender = dplyr::case_match(
CEOgender,
1 ~ "Man", 2 ~ "Woman", 3 ~ "Trans", 4 ~ "NB", 5 ~ "Oth",
.default = "Oth"
),
race_check = "Yes",
gender_check = "Yes"
) %>%
tidyr::pivot_wider(
names_from = CEOrace,
names_glue = "CEOrace_{CEOrace}",
values_from = race_check,
values_fill = "No"
) %>%
tidyr::pivot_wider(
names_from = CEOgender,
names_glue = "CEOgender_{CEOgender}",
values_from = gender_check,
values_fill = "No"
)
# Wrangle Data for Year 1 Responses on Board Chair Race and Gender
year1_board <- year1_raw %>%
dplyr::select(EIN, BCrace, BCgender) %>%
dplyr::filter( ! BCrace %in% c(-99, NA),
! BCgender %in% c(-99, NA)) %>%
dplyr::mutate(
BCrace = dplyr::case_match(
BCrace,
1 ~ "AAPI", 2 ~ "Black", 3 ~ "Hisp", 4 ~ "NativeAm", 5 ~ "White", 6 ~ "Bi", 7 ~ "Oth",
.default = "Oth"
),
BCgender = dplyr::case_match(
BCgender,
1 ~ "Man", 2 ~ "Woman", 3 ~ "Trans", 4 ~ "NB", 5 ~ "Oth",
.default = "Oth"
),
race_check = "Yes",
gender_check = "Yes"
) %>%
tidyr::pivot_wider(
names_from = BCrace,
names_glue = "BChairrace_{BCrace}",
values_from = race_check,
values_fill = "No"
) %>%
tidyr::pivot_wider(
names_from = BCgender,
names_glue = "BChairgender_{BCgender}",
values_from = gender_check,
values_fill = "No"
)
df.race2021 <- merge( year1_ceo, year1_board, by="EIN", all=TRUE )11 Get Year2 Race & Gender Variables
fpath <- "DATA-PREP/02-year-two/03-data-final/"
fname <- "YEAR-02-DATA-RESTRICTED-V02.csv"
year2_final <- read.csv( paste0( fpath, fname ) )
# ALL RACE AND GENDER CATEGORIES
race.vars <-
c("CEOrace_AAPI", "CEOrace_Black", "CEOrace_Hisp", "CEOrace_NativeAm",
"CEOrace_White", "CEOrace_Oth", "CEOrace_Text", "CEOgender_Man",
"CEOgender_Woman", "CEOgender_Trans", "CEOgender_NB", "CEOgender_Oth",
"CEOgender_specify", "BChairrace_AAPI", "BChairrace_Black", "BChairrace_Hisp",
"BChairrace_NativeAm", "BChairrace_White", "BChairrace_Oth",
"BChairrace_Text", "BChairgender_Man", "BChairgender_Woman",
"BChairgender_Trans", "BChairgender_NB", "BChairgender_Oth",
"BChairgender_specify")
# select all race variables plus EIN for merge
df.race2022 <- year2_final[ c("EIN",race.vars) ] dupes2 <- df.race2022 %>% group_by(EIN) %>% filter( n()>1 ) %>% ungroup() 12 Use Year1 Values if No Year2 Response
# drop 2021 values if there are 2022 versions
df.race2021$EIN <- as.numeric( df.race2021$EIN )
df.race2022$EIN <- as.numeric( df.race2022$EIN )
drop <- intersect( df.race2021$EIN, df.race2022$EIN )
# drop %>% length() 531
# nrow( df.race2021 ) 1656
# nrow( df.race2022 ) 691
#
# total nrow: 1656 + 691 - 531 - 17 (dupes) = 1799
df.race2021 <-
df.race2021 %>%
dplyr::filter( ! EIN %in% drop )
df.race2022 <- dplyr::filter( df.race2022, ! is.na(EIN) )
df.race2022 <- dplyr::bind_rows( df.race2022, df.race2021 )
# nrow( df.race2022 ) 179913 Merge 2023 and 2022 Versions
# creates race.vars.y versions:
# CEOrace_AAPI & CEOrace_AAPI.y
survey_df <-
merge( survey_df, df.race2022,
by.x="ExternalReference", by.y="EIN",
all.x=TRUE )
# remove .x from varname.x versions:
nm <- names( survey_df )
nm <- gsub( "\\.x$", "", nm )
names( survey_df ) <- nm
###
### CEO UPDATES
###
# APPLY TO COLUMNS K:
COLUMNS <-
c("CEOrace_AAPI", "CEOrace_Black",
"CEOrace_Hisp", "CEOrace_NativeAm",
"CEOrace_White",
"CEOrace_Oth", "CEOrace_Text",
"CEOgender_Man", "CEOgender_Woman",
"CEOgender_Trans", "CEOgender_NB",
"CEOgender_Oth", "CEOgender_specify" )
# ALL RESPONSES NA IN GROUP:
# indication that the question
# was skipped by the respondent
all.na <- apply( survey_df[ COLUMNS ],
MARGIN = 1,
FUN = function(x) all(is.na(x)) )
CHANGE_2023_X <- ! all.na
# NA means "no" if change occurred
na_to_no <- function(x){
x[ is.na(x) ] <- "No"
return(x)
}
survey_df[ COLUMNS ] <-
survey_df[ COLUMNS ] %>%
lapply( na_to_no )
# if they do not respond "1/yes"
# to at least one question then
# assume they skipped them all
no_to_na <- function( x, rowid ){
x[ rowid ] <- NA
return(x)
}
survey_df[ COLUMNS ] <-
survey_df[ COLUMNS ] %>%
lapply( no_to_na, rowid=all.na )
# update 2022 data with 2023 responses:
# has.change is T/F indicator of any CEO
# change in 2023
impute_race <- function( x.txt, df, has.change ){
x.txt.y <- paste0( x.txt, ".y" )
x1 <- df[[ x.txt ]] # race indicator 2023 1/NA
x2 <- df[[ x.txt.y ]] # race indicator 2022 Yes/No
# update 2022 values with 2023 values
x1[ is.na(x1) ] <- ""
x2[ has.change ] <- x1[ has.change ]
# return updated vector
return( x2 )
}
survey_df[ COLUMNS ] <-
purrr::map( COLUMNS, impute_race, survey_df, CHANGE_2023_X )
###
### BOARD UPDATES
###
# APPLY TO COLUMNS K:
COLUMNS <-
c("BChairrace_AAPI", "BChairrace_Black",
"BChairrace_Hisp", "BChairrace_NativeAm",
"BChairrace_White",
"BChairrace_Oth", "BChairrace_Text",
"BChairgender_Man",
"BChairgender_Woman", "BChairgender_Trans",
"BChairgender_NB",
"BChairgender_Oth", "BChairgender_specify" )
# ALL RESPONSES NA IN GROUP:
# indication that the question
# was skipped by the respondent
all.na <- apply( survey_df[ COLUMNS ],
MARGIN = 1,
FUN = function(x) all(is.na(x)) )
# any change in 2023
CHANGE_2023_X <- ! all.na
# NA means "no"
na_to_no <- function(x){
x[ is.na(x) ] <- "No"
return(x)
}
survey_df[ COLUMNS ] <-
survey_df[ COLUMNS ] %>%
lapply( na_to_no )
# if they do not respond "1/yes"
# to at least one question then
# assume they skipped them all
no_to_na <- function( x, rowid ){
x[ rowid ] <- NA
return(x)
}
survey_df[ COLUMNS ] <-
survey_df[ COLUMNS ] %>%
lapply( no_to_na, rowid=all.na )
# update 2022 data with 2023 responses
impute_race <- function( x.txt, df, has.change ){
x.txt.y <- paste0( x.txt, ".y" )
x1 <- df[[ x.txt ]] # race indicator 2023 1/NA
x2 <- df[[ x.txt.y ]] # race indicator 2022 Yes/No
# update 2022 values with 2023 values
x1[ is.na(x1) ] <- ""
x2[ has.change ] <- x1[ has.change ]
# return updated vector
return( x2 )
}
survey_df[ COLUMNS ] <-
purrr::map( COLUMNS, impute_race, survey_df, CHANGE_2023_X )
# change varname.y to varname.2022
nm <- names( survey_df )
nm <- gsub( "\\.y$", "_v2022", nm )
names( survey_df ) <- nm 14 Create Biracial Categories in Year 3
Create a Biracial indicator variable in the Year 3 data if a CEO or Board Chair belongs to 2 or more racial groups.
# CEOs
ceo.race.qns <- grep( "CEOrace", race_gender_qns_bool, value=T )
# COUNT YES'S
num.races.ceo <-
apply( survey_df[ ceo.race.qns ],
MARGIN = 1,
FUN = function(x){ sum( x %in% c("1","Yes"), na.rm=T ) } )
survey_df$CEOrace_Bi <- ifelse( num.races.ceo > 1, "Yes","No" )
# OMIT ROWS WITH NO RESPONSE
is_missing <- function(x){
# can be NA or ""
x[ is.na(x) ] <- ""
return( x == "" )
}
all.missing <-
apply( survey_df[ ceo.race.qns ],
MARGIN = 1,
FUN = function(x){ all( is_missing(x) ) } )
survey_df$CEOrace_Bi[ all.missing ] <- ""
# BOARD CHAIRs
board.race.qns <- grep( "BChairrace", race_gender_qns_bool, value=T )
# COUNT YES
num.races.chair <-
apply( survey_df[ board.race.qns ],
MARGIN = 1,
FUN = function(x){ sum( x %in% c("1","Yes"), na.rm=T ) } )
survey_df$BChairrace_Bi <- ifelse( num.races.chair > 1, "Yes","No" )
# OMIT ROWS WITH NO RESPONSE
all.missing <-
apply( survey_df[ board.race.qns ],
MARGIN = 1,
FUN = function(x){ all( is_missing(x) ) } )
survey_df$BChairrace_Bi[ all.missing ] <- ""
# MAKE MEMISC ITEM
biracial_qns <- c( "CEOrace_Bi","BChairrace_Bi" )
# Create Survey Item
survey_df <- create_survey_item(
survey_df,
biracial_qns,
recode_vals = c("Yes","No", "" ),
recode_labs = c("Yes","No", "Missing" ),
missing_vals = ""
)Example:
BChairrace_Bi
| Storage mode: | character |
| Measurement: | nominal |
| Missing values: |
| Values and labels | N | Valid | Total | ||||||
| M | ‘Missing’ | 94 | 18 | . | 7 | ||||
| No | ‘No’ | 401 | 98 | . | 3 | 79 | . | 9 | |
| Yes | ‘Yes’ | 7 | 1 | . | 7 | 1 | . | 4 | |
15 Create Combined Race Field
While our race and gender columns are individual binary columns, this is not the case for year 1 and 3 results. Both those years contain single columns for race and gender variables for CEOs and Board Chairs respectively. For easier comparability, we create a new variable aggregating race and gender values for all our individual binary columns.
15.1 Race and Gender Checkboxes
These checkboxes are ticked by the respondent to indicate if their CEO or board chair belong to a specified race or gender identity.
| Original Value | Description | Recode Label | Recode Value | Code as Missing ? |
|---|---|---|---|---|
| Asian/Pacific Islander | Checkbox Checked | Yes | 1 | No |
| Black/African American | Checkbox Checked | Yes | 1 | No |
| Latinx/Hispanic | Checkbox Checked | Yes | 1 | No |
| Native American/American Indian | Checkbox Checked | Yes | 1 | No |
| White | Checkbox Checked | Yes | 1 | No |
| Man | Checkbox Checked | Yes | 1 | No |
| Woman | Checkbox Checked | Yes | 1 | No |
| Trans | Checkbox Checked | Yes | 1 | No |
| Gender non-conforming/Non-Binary | Checkbox Checked | Yes | 1 | No |
| Other (please specify) | Checkbox Checked | Yes | 1 | No |
| 0 | Checkbox Unchecked | No | 0 | No |
| NA | Unanswered | NA | NA | Yes |
# APPLY TO COLUMNS K:
COLUMNS <- race_gender_qns_bool
# VALUES THAT NEED RECODING
RULES <- c( " 1 =>> Yes ",
" Yes =>> Yes ",
" No =>> No ",
" NA =>> _NA_ " )
rules <- parse_rules( RULES )
pattern <- rules[[ "pattern" ]]
replace <- rules[[ "replace" ]]
# MEMISC LABELS AND MISSING VALUE CODES
values <- c( "No", "Yes", "", "_NA_" )
labels <- c( "No", "Yes", "Missing in 2022", "No Response" )
missing <- c( "", "_NA_" )
# RECODE VARIABLES
survey_df <-
survey_df %>%
recode_columns( k=COLUMNS, pattern, replace, values, labels, missing )Example:
CEOrace_White — ‘TYPE: boolean’
“Race of CEO”
| Storage mode: | character |
| Measurement: | nominal |
| Missing values: | , NA |
| Values and labels | N | Valid | Total | ||||||
| M | ‘Missing in 2022’ | 59 | 11 | . | 8 | ||||
| NA | M | ‘No Response’ | 54 | 10 | . | 8 | |||
| No | ‘No’ | 42 | 10 | . | 8 | 8 | . | 4 | |
| Yes | ‘Yes’ | 347 | 89 | . | 2 | 69 | . | 1 | |
- QUESTION TXT:
- Which of the following best describes the race/ethnicity for your organization’s current Chief Executive (i.e., Executive Director or CEO)? (Select all that apply) - Selected Choice - White
15.2 Aggregated Race Variable
Create a single categorical field for Race.
| Original Value | Description | Recode Label | Recode Value | Code as Missing ? |
|---|---|---|---|---|
| ‘Yes’ in CEOrace_AAPI | Asian/Pacific Islander | AAPI | 1 | No |
| ‘Yes’ in CEOrace_Black | Black/African American | Black | 2 | No |
| ‘Yes’ in CEOrace_Hisp | Latinx/Hispanic | Hispanic | 3 | No |
| ‘Yes’ in CEOrace_NativeAm | Native American/American Indian | NativeAm | 4 | No |
| ‘Yes’ in CEOrace_White | White | White | 5 | No |
| ‘Yes’ in CEOrace_Oth | Other (please specify) | Other | 6 | No |
| ‘Yes’ in CEOrace_Bi | Bi/Multi-racial | Biracial | 7 | No |
| No Reponse | No race reported | NA | NA | Yes |
# Create New Race variables
survey_df <- survey_df %>%
dplyr::mutate(
CEOrace = dplyr::case_when(
CEOrace_AAPI == 'Yes' ~ 1,
CEOrace_Black == 'Yes' ~ 2,
CEOrace_Hisp == 'Yes' ~ 3,
CEOrace_NativeAm == 'Yes' ~ 4,
CEOrace_White == 'Yes' ~ 5,
CEOrace_Oth == 'Yes' ~ 6,
CEOrace_Bi == 'Yes' ~ 7,
.default = NA
),
BChairrace = dplyr::case_when(
BChairrace_AAPI == 'Yes' ~ 1,
BChairrace_Black == 'Yes' ~ 2,
BChairrace_Hisp == 'Yes' ~ 3,
BChairrace_NativeAm == 'Yes' ~ 4,
BChairrace_White == 'Yes' ~ 5,
BChairrace_Oth == 'Yes' ~ 6,
BChairrace_Bi == 'Yes' ~ 7,
.default = NA
))
# Manage any coding conflicts:
survey_df$CEOrace[ survey_df$CEOrace_Bi == "Yes" ] <- 7
survey_df$BChairrace[ survey_df$BChairrace_Bi == "Yes" ] <- 7
# APPLY TO COLUMNS K:
race.categories <- c("CEOrace", "BChairrace")
COLUMNS <- race.categories
# VALUES THAT NEED RECODING
RULES <- c( " NA =>> _NA_ " )
rules <- parse_rules( RULES )
pattern <- rules[[ "pattern" ]]
replace <- rules[[ "replace" ]]
# MEMISC LABELS AND MISSING VALUE CODES
values <- c( 1, 2, 3, 4, 5, 6, 7, "_NA_" )
labels <- c( "AAPI", "Black", "Hispanic", "NativeAm",
"White", "Other","Biracial", "No Response" )
missing <- "_NA_"
dd.add <-
data.frame( vname=c("CEOrace", "BChairrace"),
vlabel=c("Race of the CEO","Race of the Board Chair"),
type=c("Categorical","Categorical"),
description=c("Race categories combined into a single factor") )
dd <- dplyr::bind_rows( dd, dd.add )
# RECODE VARIABLES
survey_df <-
survey_df %>%
recode_columns( k=COLUMNS, pattern, replace, values, labels, missing )Example:
CEOrace — ‘TYPE: Categorical’
“Race of the CEO”
| Storage mode: | character |
| Measurement: | nominal |
| Missing values: | NA |
| Values and labels | N | Valid | Total | ||||||
| NA | M | ‘No Response’ | 115 | 22 | . | 9 | |||
| 1 | ‘AAPI’ | 5 | 1 | . | 3 | 1 | . | 0 | |
| 2 | ‘Black’ | 20 | 5 | . | 2 | 4 | . | 0 | |
| 3 | ‘Hispanic’ | 6 | 1 | . | 6 | 1 | . | 2 | |
| 4 | ‘NativeAm’ | 3 | 0 | . | 8 | 0 | . | 6 | |
| 5 | ‘White’ | 344 | 88 | . | 9 | 68 | . | 5 | |
| 6 | ‘Other’ | 6 | 1 | . | 6 | 1 | . | 2 | |
| 7 | ‘Biracial’ | 3 | 0 | . | 8 | 0 | . | 6 | |
- QUESTION TXT:
- Race categories combined into a single factor
16 Create Combined Gender Field
| Original Value | Description | Recode Label | Recode Value | Code as Missing ? |
|---|---|---|---|---|
| ‘Yes’ in CEOgender_Man | Man | Man | 1 | No |
| ‘Yes’ in CEOgender_Woman | Woman | Woman | 2 | No |
| ‘Yes’ in CEOgender_Trans | Trans | Trans | 3 | No |
| ‘Yes’ in CEOgender_NB | Gender non-conforming/Non-Binary | NB | 4 | No |
| ‘Yes’ in CEOgender_Oth | Other (please specify) | Other | 5 | No |
| No response | No gender reported | NA | NA | Yes |
survey_df <- survey_df %>%
dplyr::mutate(
CEOgender = dplyr::case_when(
CEOgender_Man == 'Yes' ~ 1,
CEOgender_Woman == 'Yes' ~ 2,
CEOgender_Trans == 'Yes' ~ 3,
CEOgender_NB == 'Yes' ~ 4,
CEOgender_Oth == 'Yes' ~ 5,
.default = NA
),
BChairgender = dplyr::case_when(
BChairgender_Man == 'Yes' ~ 1,
BChairgender_Woman == 'Yes' ~ 2,
BChairgender_Trans == 'Yes' ~ 3,
BChairgender_NB == 'Yes' ~ 4,
BChairgender_Oth == 'Yes' ~ 5,
.default = NA
))
# APPLY TO COLUMNS K:
COLUMNS <- c("CEOgender", "BChairgender")
# VALUES THAT NEED RECODING
RULES <- c( " NA =>> _NA_ " )
rules <- parse_rules( RULES )
pattern <- rules[[ "pattern" ]]
replace <- rules[[ "replace" ]]
# MEMISC LABELS AND MISSING VALUE CODES
values <- c( 1, 2, 3, 4, 5, "_NA_" )
labels <- c( "Man", "Woman", "Trans", "NB", "Other", "No Response" )
missing <- "_NA_"
dd.add <-
data.frame( vname=c("CEOgender", "BChairgender"),
vlabel=c("Race of the CEO","Race of the Board Chair"),
type=c("Categorical","Categorical"),
description=c("Race categories combined into a single factor") )
dd <- dplyr::bind_rows( dd, dd.add )
# RECODE VARIABLES
survey_df <-
survey_df %>%
recode_columns( k=COLUMNS, pattern, replace, values, labels, missing )Example:
CEOgender — ‘TYPE: Categorical’
“Race of the CEO”
| Storage mode: | character |
| Measurement: | nominal |
| Missing values: | NA |
| Values and labels | N | Valid | Total | ||||||
| NA | M | ‘No Response’ | 113 | 22 | . | 5 | |||
| 1 | ‘Man’ | 67 | 17 | . | 2 | 13 | . | 3 | |
| 2 | ‘Woman’ | 316 | 81 | . | 2 | 62 | . | 9 | |
| 3 | ‘Trans’ | 1 | 0 | . | 3 | 0 | . | 2 | |
| 4 | ‘NB’ | 3 | 0 | . | 8 | 0 | . | 6 | |
| 5 | ‘Other’ | 2 | 0 | . | 5 | 0 | . | 4 | |
- QUESTION TXT:
- Race categories combined into a single factor
16.1 Save outputs
fpath <- "DATA-PREP/03-year-three/03-data-final/"
fname <- "YEAR-03-DATA-RESTRICTED-V05.csv"
write.csv( survey_df, paste0( fpath, fname ), row.names=F, na="" )