library( dplyr )
<- "DATA-PREP/02-year-two/03-data-final/"
fpath <- "YEAR-02-DATA-RESTRICTED-V02.csv"
fname <- read.csv( paste0( fpath, fname ) ) df
7 Preparing Public Use Files Y2
7.1 Load Restricted Data
7.2 Remove Identifying Variables
Remove all variables that:
- identify the organization
- identify individuals
- include long-form responses that might include details the compromise privacy
<-
drop.these c("EIN", "EIN2", "Name_Primary", "Name_Secondary",
"Addr_StreetLocation", "Addr_CityState", "Addr_ZIP",
"ContactInfoUpdate_1", "ContactInfoUpdate_2",
"ContactInfoUpdate_3", "Comments",
"FirstName_Y2_Updated", "LastName_Y2_Updated",
"Title_Y2_Updated", "PrimaryConcern",
"PrgSrvc_OthChng_Text",
"Staff_Other_Text_2021",
"Staff_Other_Text_2022",
"Staff_Other_Est_NA", "FndRaise_Text",
"Finance_Rev_Oth_Text","FinanceChng_Text",
"LeadershipChng_Oth_Text",
"CEOrace_Text","BChairrace_Text" )
<-
keep c("ResponseID","year2wt","PrgSrvc_IncrNum",
"PrgSrvc_DcrsNum", "PrgSrvc_Suspend", "PrgSrvc_IncrSrvc", "PrgSrvc_DcrsSrvc",
"PrgSrvc_NewOffc", "PrgSrvc_ClsdOffc", "PrgSrvc_IncrArea", "PrgSrvc_DcrsArea",
"PrgSrvc_IncrFee", "PrgSrvc_DcrsFee", "PrgSrvc_ShiftOnline",
"PrgSrvc_AddOnline", "PrgSrvc_Oth", "PplSrv_NumServed",
"PplSrv_NumWait", "PplSrv_NumServed_NA_X", "PplSrv_NumWait_NA_X",
"Dmnd_NxtYear", "Staff_Fulltime_2021", "Staff_Parttime_2021",
"Staff_Boardmmbr_2021", "Staff_RegVlntr_2021", "Staff_EpsdVltnr_2021",
"Staff_AmerVlntr_2021", "Staff_PdCnslt_2021",
"Staff_Other_Est_2021", "Staff_Fulltime_2022", "Staff_Parttime_2022",
"Staff_Boardmmbr_2022", "Staff_RegVlntr_2022", "Staff_EpsdVltnr_2022",
"Staff_AmerVlntr_2022", "Staff_PdCnslt_2022",
"Staff_Other_Est_2022", "Staff_Fulltime_NA", "Staff_Parttime_NA",
"Staff_Boardmmbr_NA", "Staff_RegVlntr_NA", "Staff_EpsdVltnr_NA",
"Staff_AmerVlntr_NA", "Staff_PdCnslt_NA",
"Staff_Other_Text_NA", "VolImportance", "DonImportance", "FndRaise_DvlpVirtual",
"FndRaise_IncrExp", "FndRaise_DcrsExp", "FndRaise_IncStaff",
"FndRaise_DcrsStaff", "FndRaise_IncrCnslt", "FndRaise_DcrsCnslt",
"FndRaise_IncrVolntr", "FndRaise_DcrsVolntr", "FndRaise_Othr",
"FndRaise_MajGift_Amt", "FndRaise_Overall_Chng",
"FndRaise_Cashbelow250_Chng", "FndRaise_Cashabove250_Chng", "FndRaise_MajGift_Chng",
"FndRaise_Corp.Found_Grnt_Chng", "FndRaise_RstrGift_Chng", "FndRaise_UnrstrGift_Chng",
"FndRaise_DnrBlw250", "FndRaise_DnrAbv250", "FndRaise_LocGvtGrnt_Seek",
"FndRaise_StateGvtGrnt_Seek", "FndRaise_FedGvtGrnt_Seek", "FndRaise_LocGvtCntrct_Seek",
"FndRaise_StateGvtCntrct_Seek", "FndRaise_FedGvtCntrct_Seek",
"FndRaise_PFGrnt_Seek", "FndRaise_CFGrnt_Seek", "FndRaise_DAF_Seek",
"FndRaise_Corp.Found_Grnt_Seek", "FndRaise_UntdWy_Seek", "FndRaise_CombFedCmpgn_Seek",
"FndRaise_OthrGvngPrgrm_Seek", "FndRaise_LocGvtGrnt_Rcv", "FndRaise_StateGvtGrnt_Rcv",
"FndRaise_FedGvtGrnt_Rcv", "FndRaise_LocGvtCntrct_Rcv", "FndRaise_StateGvtCntrct_Rcv",
"FndRaise_FedGvtCntrct_Rcv", "FndRaise_PFGrnt_Rcv", "FndRaise_CFGrnt_Rcv",
"FndRaise_DAF_Rcv", "FndRaise_Corp.Found_Grnt_Rcv", "FndRaise_UntdWy_Rcv",
"FndRaise_CombFedCmpgn_Rcv", "FndRaise_OthrGvngPrgrm_Rcv", "Finance_Rev_GovtMain",
"Finance_Rev_Prtcpnt", "Finance_Rev_Govt3rdParty", "Finance_Rev_IndvDon",
"Finance_Rev_Gift", "Finance_Rev_Grnt", "Finance_Rev_Spnsr",
"Finance_Rev_Oth", "Reserves_Est", "Reserves_NA_X",
"CARES_Rcv", "CARES_Rcv_Est", "FinanceChng_Reserves", "FinanceChng_Borrow",
"FinanceChng_DcrsBnft", "FinanceChng_IncrBnft", "FinanceChng_IncrExp",
"FinanceChng_DcrsExp", "FinanceChng_Oth",
"LeadershipChng_RetCEO", "LeadershipChng_RsgnCEO", "LeadershipChng_TrmnCEO",
"LeadershipChng_HireCEO", "LeadershipChng_IntrmCEO", "LeadershipChng_ChngBC",
"LeadershipChng_LostBoardMem", "LeadershipChng_RplcBoardMem",
"LeadershipChng_AddBoardMem", "LeadershipChng_Oth",
"CEOrace_AAPI", "CEOrace_Black", "CEOrace_Hisp", "CEOrace_NativeAm",
"CEOrace_White", "CEOrace_Oth", "CEOgender_Man",
"CEOgender_Woman", "CEOgender_Trans", "CEOgender_NB", "CEOgender_Oth",
"CEOgender_specify", "BChairrace_AAPI", "BChairrace_Black", "BChairrace_Hisp",
"BChairrace_NativeAm", "BChairrace_White", "BChairrace_Oth",
"BChairgender_Man", "BChairgender_Woman",
"BChairgender_Trans", "BChairgender_NB", "BChairgender_Oth",
"BChairgender_specify", "ExtAffairs_GenEd", "ExtAffairs_Media",
"ExtAffairs_Advocacy", "ExtAffairs_GovtRs", "ExtAffairs_DiscGovtGrnt",
"ExtAffairs_InfoReq", "ExtAffairs_Testify", "ExtAffairs_Lobby",
"ExtAffairs_OrgPrtst", "ExtAffairs_Mobilize", "ExtAffairs_Petition",
"ExtAffairs_VoteReg", "ExtAffairs_VoteEd",
"CEOrace", "CEOrace_Bi", "BChairrace", "BChairrace_Bi",
"CEOgender", "BChairgender" )
<- df[keep] df
7.3 Add Noise to Numeric Responses
Add noise to numeric responses that could possibly be compared with public records to identify the organization.
<-
financials c( "Staff_Fulltime_2021", "Staff_Parttime_2021", "Staff_Boardmmbr_2021",
"Staff_RegVlntr_2021", "Staff_EpsdVltnr_2021", "Staff_AmerVlntr_2021",
"Staff_PdCnslt_2021", "Staff_Other_Est_2021",
"Staff_Fulltime_2022", "Staff_Parttime_2022", "Staff_Boardmmbr_2022",
"Staff_RegVlntr_2022", "Staff_EpsdVltnr_2022", "Staff_AmerVlntr_2022",
"Staff_PdCnslt_2022", "Staff_Other_Est_2022",
"FndRaise_MajGift_Amt", "FndRaise_DnrBlw250", "FndRaise_DnrAbv250",
"Finance_Rev_GovtMain", "Finance_Rev_Prtcpnt",
"Finance_Rev_Govt3rdParty", "Finance_Rev_IndvDon", "Finance_Rev_Gift",
"Finance_Rev_Grnt", "Finance_Rev_Spnsr", "Finance_Rev_Oth",
"Reserves_Est", "CARES_Rcv_Est" )
<- function(x){
add_noise if( is.na(x) | is.nan(x) ){ return(NA) }
<- x/20 # standard dev of 5% of base
v <- rnorm( n=1, mean=x, sd=v ) |> round(0)
x return(x)
}
add_noise( 3000 )
[1] 2925
<- function(x){
make_noise <- purrr::map_int( x, add_noise )
x return(x)
}
<- df # make a copy for comparison
df2
<- lapply( df[financials], make_noise )
df[financials]
<- quantile( df$Finance_Rev_IndvDon, 0.9, na.rm=T )
max
plot( df$Finance_Rev_IndvDon, df2$Finance_Rev_IndvDon,
bty="n", pch=19, col=gray(0.5,0.5), cex=1.5,
xlim=c(0,max), ylim=c(0,max),
main="Inspect Noise" )
<- quantile( df$Staff_Fulltime_2021, 0.9, na.rm=T )
max
plot( df$Staff_Fulltime_2021, df2$Staff_Fulltime_2021,
bty="n", pch=19, col=gray(0.5,0.5), cex=1.5,
xlim=c(0,max), ylim=c(0,max),
main="Inspect Noise" )
7.4 Save to File
<- "DATA-PREP/02-year-two/03-data-final/"
fpath <- "YEAR-02-DATA-PUF.csv"
fname write.csv( df, paste0( fpath, fname ), row.names=FALSE, na="" )