<- "DATA-PREP/01-year-one/02-data-intermediate/"
fpath <- "wave-01-data-intermediate.csv"
fname <- readr::read_csv( paste0( fpath, fname ) ) year1_raw
9 Reconcile Demographic Variables
10 Get Year1 Race Variables
# Wrangle Data for Year 1
# Responses on CEO race and Gender
<- year1_raw %>%
year1_ceo ::select(EIN, CEOrace, CEOgender ) %>%
dplyr::filter( ! CEOrace %in% c(-99, NA),
dplyr! CEOgender %in% c(-99, NA) ) %>%
::mutate(
dplyrCEOrace = dplyr::case_match(
CEOrace,1 ~ "AAPI", 2 ~ "Black", 3 ~ "Hisp", 4 ~ "NativeAm", 5 ~ "White", 6 ~ "Bi", 7 ~ "Oth",
.default = "Oth"
),CEOgender = dplyr::case_match(
CEOgender,1 ~ "Man", 2 ~ "Woman", 3 ~ "Trans", 4 ~ "NB", 5 ~ "Oth",
.default = "Oth"
),race_check = "Yes",
gender_check = "Yes"
%>%
) ::pivot_wider(
tidyrnames_from = CEOrace,
names_glue = "CEOrace_{CEOrace}",
values_from = race_check,
values_fill = "No"
%>%
) ::pivot_wider(
tidyrnames_from = CEOgender,
names_glue = "CEOgender_{CEOgender}",
values_from = gender_check,
values_fill = "No"
)
# Wrangle Data for Year 1 Responses on Board Chair Race and Gender
<- year1_raw %>%
year1_board ::select(EIN, BCrace, BCgender) %>%
dplyr::filter( ! BCrace %in% c(-99, NA),
dplyr! BCgender %in% c(-99, NA)) %>%
::mutate(
dplyrBCrace = dplyr::case_match(
BCrace,1 ~ "AAPI", 2 ~ "Black", 3 ~ "Hisp", 4 ~ "NativeAm", 5 ~ "White", 6 ~ "Bi", 7 ~ "Oth",
.default = "Oth"
),BCgender = dplyr::case_match(
BCgender,1 ~ "Man", 2 ~ "Woman", 3 ~ "Trans", 4 ~ "NB", 5 ~ "Oth",
.default = "Oth"
),race_check = "Yes",
gender_check = "Yes"
%>%
) ::pivot_wider(
tidyrnames_from = BCrace,
names_glue = "BChairrace_{BCrace}",
values_from = race_check,
values_fill = "No"
%>%
) ::pivot_wider(
tidyrnames_from = BCgender,
names_glue = "BChairgender_{BCgender}",
values_from = gender_check,
values_fill = "No"
)
<- merge( year1_ceo, year1_board, by="EIN", all=TRUE ) df.race2021
11 Get Year2 Race & Gender Variables
<- "DATA-PREP/02-year-two/03-data-final/"
fpath <- "YEAR-02-DATA-RESTRICTED-V02.csv"
fname <- read.csv( paste0( fpath, fname ) )
year2_final
# ALL RACE AND GENDER CATEGORIES
<-
race.vars c("CEOrace_AAPI", "CEOrace_Black", "CEOrace_Hisp", "CEOrace_NativeAm",
"CEOrace_White", "CEOrace_Oth", "CEOrace_Text", "CEOgender_Man",
"CEOgender_Woman", "CEOgender_Trans", "CEOgender_NB", "CEOgender_Oth",
"CEOgender_specify", "BChairrace_AAPI", "BChairrace_Black", "BChairrace_Hisp",
"BChairrace_NativeAm", "BChairrace_White", "BChairrace_Oth",
"BChairrace_Text", "BChairgender_Man", "BChairgender_Woman",
"BChairgender_Trans", "BChairgender_NB", "BChairgender_Oth",
"BChairgender_specify")
# select all race variables plus EIN for merge
<- year2_final[ c("EIN",race.vars) ] df.race2022
<- df.race2022 %>% group_by(EIN) %>% filter( n()>1 ) %>% ungroup() dupes2
12 Use Year1 Values if No Year2 Response
# drop 2021 values if there are 2022 versions
$EIN <- as.numeric( df.race2021$EIN )
df.race2021$EIN <- as.numeric( df.race2022$EIN )
df.race2022<- intersect( df.race2021$EIN, df.race2022$EIN )
drop
# drop %>% length() 531
# nrow( df.race2021 ) 1656
# nrow( df.race2022 ) 691
#
# total nrow: 1656 + 691 - 531 - 17 (dupes) = 1799
<-
df.race2021 %>%
df.race2021 ::filter( ! EIN %in% drop )
dplyr
<- dplyr::filter( df.race2022, ! is.na(EIN) )
df.race2022 <- dplyr::bind_rows( df.race2022, df.race2021 )
df.race2022
# nrow( df.race2022 ) 1799
13 Merge 2023 and 2022 Versions
# creates race.vars.y versions:
# CEOrace_AAPI & CEOrace_AAPI.y
<-
survey_df merge( survey_df, df.race2022,
by.x="ExternalReference", by.y="EIN",
all.x=TRUE )
# remove .x from varname.x versions:
<- names( survey_df )
nm <- gsub( "\\.x$", "", nm )
nm names( survey_df ) <- nm
###
### CEO UPDATES
###
# APPLY TO COLUMNS K:
<-
COLUMNS c("CEOrace_AAPI", "CEOrace_Black",
"CEOrace_Hisp", "CEOrace_NativeAm",
"CEOrace_White",
"CEOrace_Oth", "CEOrace_Text",
"CEOgender_Man", "CEOgender_Woman",
"CEOgender_Trans", "CEOgender_NB",
"CEOgender_Oth", "CEOgender_specify" )
# ALL RESPONSES NA IN GROUP:
# indication that the question
# was skipped by the respondent
<- apply( survey_df[ COLUMNS ],
all.na MARGIN = 1,
FUN = function(x) all(is.na(x)) )
<- ! all.na
CHANGE_2023_X
# NA means "no" if change occurred
<- function(x){
na_to_no is.na(x) ] <- "No"
x[ return(x)
}
<-
survey_df[ COLUMNS ] %>%
survey_df[ COLUMNS ] lapply( na_to_no )
# if they do not respond "1/yes"
# to at least one question then
# assume they skipped them all
<- function( x, rowid ){
no_to_na <- NA
x[ rowid ] return(x)
}
<-
survey_df[ COLUMNS ] %>%
survey_df[ COLUMNS ] lapply( no_to_na, rowid=all.na )
# update 2022 data with 2023 responses:
# has.change is T/F indicator of any CEO
# change in 2023
<- function( x.txt, df, has.change ){
impute_race <- paste0( x.txt, ".y" )
x.txt.y
<- df[[ x.txt ]] # race indicator 2023 1/NA
x1 <- df[[ x.txt.y ]] # race indicator 2022 Yes/No
x2
# update 2022 values with 2023 values
is.na(x1) ] <- ""
x1[ <- x1[ has.change ]
x2[ has.change ]
# return updated vector
return( x2 )
}
<-
survey_df[ COLUMNS ] ::map( COLUMNS, impute_race, survey_df, CHANGE_2023_X )
purrr
###
### BOARD UPDATES
###
# APPLY TO COLUMNS K:
<-
COLUMNS c("BChairrace_AAPI", "BChairrace_Black",
"BChairrace_Hisp", "BChairrace_NativeAm",
"BChairrace_White",
"BChairrace_Oth", "BChairrace_Text",
"BChairgender_Man",
"BChairgender_Woman", "BChairgender_Trans",
"BChairgender_NB",
"BChairgender_Oth", "BChairgender_specify" )
# ALL RESPONSES NA IN GROUP:
# indication that the question
# was skipped by the respondent
<- apply( survey_df[ COLUMNS ],
all.na MARGIN = 1,
FUN = function(x) all(is.na(x)) )
# any change in 2023
<- ! all.na
CHANGE_2023_X
# NA means "no"
<- function(x){
na_to_no is.na(x) ] <- "No"
x[ return(x)
}
<-
survey_df[ COLUMNS ] %>%
survey_df[ COLUMNS ] lapply( na_to_no )
# if they do not respond "1/yes"
# to at least one question then
# assume they skipped them all
<- function( x, rowid ){
no_to_na <- NA
x[ rowid ] return(x)
}
<-
survey_df[ COLUMNS ] %>%
survey_df[ COLUMNS ] lapply( no_to_na, rowid=all.na )
# update 2022 data with 2023 responses
<- function( x.txt, df, has.change ){
impute_race <- paste0( x.txt, ".y" )
x.txt.y
<- df[[ x.txt ]] # race indicator 2023 1/NA
x1 <- df[[ x.txt.y ]] # race indicator 2022 Yes/No
x2
# update 2022 values with 2023 values
is.na(x1) ] <- ""
x1[ <- x1[ has.change ]
x2[ has.change ]
# return updated vector
return( x2 )
}
<-
survey_df[ COLUMNS ] ::map( COLUMNS, impute_race, survey_df, CHANGE_2023_X )
purrr
# change varname.y to varname.2022
<- names( survey_df )
nm <- gsub( "\\.y$", "_v2022", nm )
nm names( survey_df ) <- nm
14 Create Biracial Categories in Year 3
Create a Biracial indicator variable in the Year 3 data if a CEO or Board Chair belongs to 2 or more racial groups.
# CEOs
<- grep( "CEOrace", race_gender_qns_bool, value=T )
ceo.race.qns
# COUNT YES'S
<-
num.races.ceo apply( survey_df[ ceo.race.qns ],
MARGIN = 1,
FUN = function(x){ sum( x %in% c("1","Yes"), na.rm=T ) } )
$CEOrace_Bi <- ifelse( num.races.ceo > 1, "Yes","No" )
survey_df
# OMIT ROWS WITH NO RESPONSE
<- function(x){
is_missing # can be NA or ""
is.na(x) ] <- ""
x[ return( x == "" )
}
<-
all.missing apply( survey_df[ ceo.race.qns ],
MARGIN = 1,
FUN = function(x){ all( is_missing(x) ) } )
$CEOrace_Bi[ all.missing ] <- ""
survey_df
# BOARD CHAIRs
<- grep( "BChairrace", race_gender_qns_bool, value=T )
board.race.qns
# COUNT YES
<-
num.races.chair apply( survey_df[ board.race.qns ],
MARGIN = 1,
FUN = function(x){ sum( x %in% c("1","Yes"), na.rm=T ) } )
$BChairrace_Bi <- ifelse( num.races.chair > 1, "Yes","No" )
survey_df
# OMIT ROWS WITH NO RESPONSE
<-
all.missing apply( survey_df[ board.race.qns ],
MARGIN = 1,
FUN = function(x){ all( is_missing(x) ) } )
$BChairrace_Bi[ all.missing ] <- ""
survey_df
# MAKE MEMISC ITEM
<- c( "CEOrace_Bi","BChairrace_Bi" )
biracial_qns
# Create Survey Item
<- create_survey_item(
survey_df
survey_df,
biracial_qns, recode_vals = c("Yes","No", "" ),
recode_labs = c("Yes","No", "Missing" ),
missing_vals = ""
)
Example:
BChairrace_Bi
Storage mode: | character |
Measurement: | nominal |
Missing values: |
Values and labels | N | Valid | Total | ||||||
M | ‘Missing’ | 94 | 18 | . | 7 | ||||
No | ‘No’ | 401 | 98 | . | 3 | 79 | . | 9 | |
Yes | ‘Yes’ | 7 | 1 | . | 7 | 1 | . | 4 |
15 Create Combined Race Field
While our race and gender columns are individual binary columns, this is not the case for year 1 and 3 results. Both those years contain single columns for race and gender variables for CEOs and Board Chairs respectively. For easier comparability, we create a new variable aggregating race and gender values for all our individual binary columns.
15.1 Race and Gender Checkboxes
These checkboxes are ticked by the respondent to indicate if their CEO or board chair belong to a specified race or gender identity.
Original Value | Description | Recode Label | Recode Value | Code as Missing ? |
---|---|---|---|---|
Asian/Pacific Islander | Checkbox Checked | Yes | 1 | No |
Black/African American | Checkbox Checked | Yes | 1 | No |
Latinx/Hispanic | Checkbox Checked | Yes | 1 | No |
Native American/American Indian | Checkbox Checked | Yes | 1 | No |
White | Checkbox Checked | Yes | 1 | No |
Man | Checkbox Checked | Yes | 1 | No |
Woman | Checkbox Checked | Yes | 1 | No |
Trans | Checkbox Checked | Yes | 1 | No |
Gender non-conforming/Non-Binary | Checkbox Checked | Yes | 1 | No |
Other (please specify) | Checkbox Checked | Yes | 1 | No |
0 | Checkbox Unchecked | No | 0 | No |
NA | Unanswered | NA | NA | Yes |
# APPLY TO COLUMNS K:
<- race_gender_qns_bool
COLUMNS
# VALUES THAT NEED RECODING
<- c( " 1 =>> Yes ",
RULES " Yes =>> Yes ",
" No =>> No ",
" NA =>> _NA_ " )
<- parse_rules( RULES )
rules <- rules[[ "pattern" ]]
pattern <- rules[[ "replace" ]]
replace
# MEMISC LABELS AND MISSING VALUE CODES
<- c( "No", "Yes", "", "_NA_" )
values <- c( "No", "Yes", "Missing in 2022", "No Response" )
labels <- c( "", "_NA_" )
missing
# RECODE VARIABLES
<-
survey_df %>%
survey_df recode_columns( k=COLUMNS, pattern, replace, values, labels, missing )
Example:
CEOrace_White
— ‘TYPE: boolean’
“Race of CEO”
Storage mode: | character |
Measurement: | nominal |
Missing values: | , NA |
Values and labels | N | Valid | Total | ||||||
M | ‘Missing in 2022’ | 59 | 11 | . | 8 | ||||
NA | M | ‘No Response’ | 54 | 10 | . | 8 | |||
No | ‘No’ | 42 | 10 | . | 8 | 8 | . | 4 | |
Yes | ‘Yes’ | 347 | 89 | . | 2 | 69 | . | 1 |
- QUESTION TXT:
- Which of the following best describes the race/ethnicity for your organization’s current Chief Executive (i.e., Executive Director or CEO)? (Select all that apply) - Selected Choice - White
15.2 Aggregated Race Variable
Create a single categorical field for Race.
Original Value | Description | Recode Label | Recode Value | Code as Missing ? |
---|---|---|---|---|
‘Yes’ in CEOrace_AAPI | Asian/Pacific Islander | AAPI | 1 | No |
‘Yes’ in CEOrace_Black | Black/African American | Black | 2 | No |
‘Yes’ in CEOrace_Hisp | Latinx/Hispanic | Hispanic | 3 | No |
‘Yes’ in CEOrace_NativeAm | Native American/American Indian | NativeAm | 4 | No |
‘Yes’ in CEOrace_White | White | White | 5 | No |
‘Yes’ in CEOrace_Oth | Other (please specify) | Other | 6 | No |
‘Yes’ in CEOrace_Bi | Bi/Multi-racial | Biracial | 7 | No |
No Reponse | No race reported | NA | NA | Yes |
# Create New Race variables
<- survey_df %>%
survey_df ::mutate(
dplyrCEOrace = dplyr::case_when(
== 'Yes' ~ 1,
CEOrace_AAPI == 'Yes' ~ 2,
CEOrace_Black == 'Yes' ~ 3,
CEOrace_Hisp == 'Yes' ~ 4,
CEOrace_NativeAm == 'Yes' ~ 5,
CEOrace_White == 'Yes' ~ 6,
CEOrace_Oth == 'Yes' ~ 7,
CEOrace_Bi .default = NA
),BChairrace = dplyr::case_when(
== 'Yes' ~ 1,
BChairrace_AAPI == 'Yes' ~ 2,
BChairrace_Black == 'Yes' ~ 3,
BChairrace_Hisp == 'Yes' ~ 4,
BChairrace_NativeAm == 'Yes' ~ 5,
BChairrace_White == 'Yes' ~ 6,
BChairrace_Oth == 'Yes' ~ 7,
BChairrace_Bi .default = NA
))
# Manage any coding conflicts:
$CEOrace[ survey_df$CEOrace_Bi == "Yes" ] <- 7
survey_df$BChairrace[ survey_df$BChairrace_Bi == "Yes" ] <- 7
survey_df
# APPLY TO COLUMNS K:
<- c("CEOrace", "BChairrace")
race.categories <- race.categories
COLUMNS
# VALUES THAT NEED RECODING
<- c( " NA =>> _NA_ " )
RULES
<- parse_rules( RULES )
rules <- rules[[ "pattern" ]]
pattern <- rules[[ "replace" ]]
replace
# MEMISC LABELS AND MISSING VALUE CODES
<- c( 1, 2, 3, 4, 5, 6, 7, "_NA_" )
values <- c( "AAPI", "Black", "Hispanic", "NativeAm",
labels "White", "Other","Biracial", "No Response" )
<- "_NA_"
missing
<-
dd.add data.frame( vname=c("CEOrace", "BChairrace"),
vlabel=c("Race of the CEO","Race of the Board Chair"),
type=c("Categorical","Categorical"),
description=c("Race categories combined into a single factor") )
<- dplyr::bind_rows( dd, dd.add )
dd
# RECODE VARIABLES
<-
survey_df %>%
survey_df recode_columns( k=COLUMNS, pattern, replace, values, labels, missing )
Example:
CEOrace
— ‘TYPE: Categorical’
“Race of the CEO”
Storage mode: | character |
Measurement: | nominal |
Missing values: | NA |
Values and labels | N | Valid | Total | ||||||
NA | M | ‘No Response’ | 115 | 22 | . | 9 | |||
1 | ‘AAPI’ | 5 | 1 | . | 3 | 1 | . | 0 | |
2 | ‘Black’ | 20 | 5 | . | 2 | 4 | . | 0 | |
3 | ‘Hispanic’ | 6 | 1 | . | 6 | 1 | . | 2 | |
4 | ‘NativeAm’ | 3 | 0 | . | 8 | 0 | . | 6 | |
5 | ‘White’ | 344 | 88 | . | 9 | 68 | . | 5 | |
6 | ‘Other’ | 6 | 1 | . | 6 | 1 | . | 2 | |
7 | ‘Biracial’ | 3 | 0 | . | 8 | 0 | . | 6 |
- QUESTION TXT:
- Race categories combined into a single factor
16 Create Combined Gender Field
Original Value | Description | Recode Label | Recode Value | Code as Missing ? |
---|---|---|---|---|
‘Yes’ in CEOgender_Man | Man | Man | 1 | No |
‘Yes’ in CEOgender_Woman | Woman | Woman | 2 | No |
‘Yes’ in CEOgender_Trans | Trans | Trans | 3 | No |
‘Yes’ in CEOgender_NB | Gender non-conforming/Non-Binary | NB | 4 | No |
‘Yes’ in CEOgender_Oth | Other (please specify) | Other | 5 | No |
No response | No gender reported | NA | NA | Yes |
<- survey_df %>%
survey_df ::mutate(
dplyrCEOgender = dplyr::case_when(
== 'Yes' ~ 1,
CEOgender_Man == 'Yes' ~ 2,
CEOgender_Woman == 'Yes' ~ 3,
CEOgender_Trans == 'Yes' ~ 4,
CEOgender_NB == 'Yes' ~ 5,
CEOgender_Oth .default = NA
),BChairgender = dplyr::case_when(
== 'Yes' ~ 1,
BChairgender_Man == 'Yes' ~ 2,
BChairgender_Woman == 'Yes' ~ 3,
BChairgender_Trans == 'Yes' ~ 4,
BChairgender_NB == 'Yes' ~ 5,
BChairgender_Oth .default = NA
))
# APPLY TO COLUMNS K:
<- c("CEOgender", "BChairgender")
COLUMNS
# VALUES THAT NEED RECODING
<- c( " NA =>> _NA_ " )
RULES <- parse_rules( RULES )
rules <- rules[[ "pattern" ]]
pattern <- rules[[ "replace" ]]
replace
# MEMISC LABELS AND MISSING VALUE CODES
<- c( 1, 2, 3, 4, 5, "_NA_" )
values <- c( "Man", "Woman", "Trans", "NB", "Other", "No Response" )
labels <- "_NA_"
missing
<-
dd.add data.frame( vname=c("CEOgender", "BChairgender"),
vlabel=c("Race of the CEO","Race of the Board Chair"),
type=c("Categorical","Categorical"),
description=c("Race categories combined into a single factor") )
<- dplyr::bind_rows( dd, dd.add )
dd
# RECODE VARIABLES
<-
survey_df %>%
survey_df recode_columns( k=COLUMNS, pattern, replace, values, labels, missing )
Example:
CEOgender
— ‘TYPE: Categorical’
“Race of the CEO”
Storage mode: | character |
Measurement: | nominal |
Missing values: | NA |
Values and labels | N | Valid | Total | ||||||
NA | M | ‘No Response’ | 113 | 22 | . | 5 | |||
1 | ‘Man’ | 67 | 17 | . | 2 | 13 | . | 3 | |
2 | ‘Woman’ | 316 | 81 | . | 2 | 62 | . | 9 | |
3 | ‘Trans’ | 1 | 0 | . | 3 | 0 | . | 2 | |
4 | ‘NB’ | 3 | 0 | . | 8 | 0 | . | 6 | |
5 | ‘Other’ | 2 | 0 | . | 5 | 0 | . | 4 |
- QUESTION TXT:
- Race categories combined into a single factor
16.1 Save outputs
<- "DATA-PREP/03-year-three/03-data-final/"
fpath <- "YEAR-03-DATA-RESTRICTED-V05.csv"
fname write.csv( survey_df, paste0( fpath, fname ), row.names=F, na="" )