Preprocessing

library(Hmisc)
library(tidyverse)
library(ipumsr)
library(reticulate)
library(janitor)
library(gt)
library(DT)

theme_set(theme_minimal())
theme_update(plot.title.position = "plot")

options(scipen = 999)

This codes loads the data from IPUMS CPS.

ddi <- read_ipums_ddi(here::here("data", "cps_00008.xml"))
data <- read_ipums_micro(ddi)
## Use of data from IPUMS CPS is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.

Counts

No Noise

We reduce our sample to just heads of households and spouses.

asec <- data %>%
  filter(ASECFLAG == 1) %>%
  filter(RELATE %in% c("101", "201"))

We count the unweighted and weighted observations.

# unweighted observations
nrow(asec)
## [1] 112502
# weighted observations
sum(asec$ASECWT)
## [1] 180129012

Histograms

No Noise – Mortenson Figure 1

The Mortenson paper does not work with the CPS data because the data are rounded and imprecise. Still, we calculate a histogram for demonstration using five years of pooled CPS ASEC data.

ddi <- read_ipums_ddi(here::here("data", "cps_00009.xml"))
cps_mort <- read_ipums_micro(ddi)
## Use of data from IPUMS CPS is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.
  1. Load the CPS ASEC
  2. Filter to where NCHILD == 2
  3. Filter to where FILESTAT %in% c(4, 5)
  4. Filter to drop dependents
  5. Sum INCWAGE, INCBUS, and INCFARM to create earned income.
  6. Inflation adjust all years into 2014 dollars
count(cps_mort, NCHILD)
## # A tibble: 10 x 2
##                    NCHILD      n
##                 <int+lbl>  <int>
##  1 0 [0 children present] 681330
##  2 1 [1 child present]    139496
##  3 2                      124260
##  4 3                       50818
##  5 4                       15810
##  6 5                        4494
##  7 6                        1406
##  8 7                         470
##  9 8                         155
## 10 9 [9+]                    134
eitc <- cps_mort %>%
  # limit to households with two children
  filter(NCHILD == 2) %>%
  # limit to single and head of household filers
  filter(FILESTAT %in% c(4, 5)) %>%
  # drop dependents
  filter(DEPSTAT == 0)

# calculate earned income
eitc <- eitc %>%
  mutate(
    earned_income = 
      INCWAGE +
      INCBUS +
      INCFARM
  )

# inflation adjust to 2014 dollars
eitc <- eitc %>%
  mutate(
    earned_income_2014 =
      case_when(
        YEAR == 2010 ~ earned_income * 236.715 / 218.076166666666,
        YEAR == 2011 ~ earned_income * 236.715 / 224.923,
        YEAR == 2012 ~ earned_income * 236.715 / 229.586083333333, 
        YEAR == 2013 ~ earned_income * 236.715 / 232.95175,
        YEAR == 2014 ~ earned_income
      )
  ) %>%
  select(earned_income, earned_income_2014)

eitc %>%
  filter(
    earned_income_2014 > 0,
    earned_income_2014 <= 30000
  ) %>%
  ggplot(aes(earned_income_2014)) +
  geom_histogram(binwidth = 1000) +
  scale_x_continuous(labels = scales::dollar) +
  labs(title = "Earned Income for Single/HOH Filers with 2 Child Dependents")

source(here::here("R", "prep_cps03.R"))

mortenson_cps <- prep_cps03()
## Use of data from IPUMS CPS is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.
# values for specifications
breaks <- seq(from = 0, to = 30000, by = 1000)

# no noise ----------------------------------------------------------------
true_histogram <- hist(mortenson_cps$earned_income_2014, breaks = breaks, plot = FALSE)$counts

Noisy Results

This table summarizes the results across all methods for histograms. Detailed data visualizations are in subsequent sections.

histogram_results <- read_csv(here::here("results", "03_cps_histograms.csv"), guess_max = 36000)

histogram_results <- histogram_results %>%
  mutate(
    relative_bias = bias /  mean(true_histogram)
  )

probs <- seq(0.1, 0.9, 0.1)

histogram_results_summary <- histogram_results %>%
  group_by(method, epsilon, delta) %>%
  summarize(
    q = probs, 
    relative_bias = quantile(relative_bias, probs = probs),
    bias = quantile(bias, probs = probs),
    rmse = quantile(rmse, probs = probs)
  ) %>%
  ungroup()

histogram_results_summary %>%
  filter(q == 0.5) %>%
  datatable()
histogram_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(q == 0.5,
         delta %in% c(NA, 0.000001)) %>%
  ggplot(aes(epsilon, relative_bias)) +
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  facet_wrap(~method, scales = "free_y") +
  labs(
    title = "Median Relative Bias for Income Histogram"
  )

histogram_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         q %in% c(0.1, 0.5, 0.9)) %>%
  ggplot(aes(factor(epsilon), relative_bias)) +
  geom_line(aes(group = factor(q)), alpha = 0.2) +
  geom_point(aes(color = factor(q)), alpha = 0.4) +
  facet_wrap(~method, scales = "free_y") +
  labs(
    title = "Relative Bias for Income Histogram",
    subtitle = "Selected percentiles"
  )

histogram_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(q == 0.5,
         delta %in% c(NA, 0.000001)) %>%
  ggplot(aes(epsilon, bias)) +
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  facet_wrap(~method, scales = "free_y") +
  labs(
    title = "Median Bias for Income Histogram"
  )

histogram_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         q %in% c(0.1, 0.5, 0.9)) %>%
  ggplot(aes(factor(epsilon), bias)) +
  geom_line(aes(group = factor(q)), alpha = 0.2) +
  geom_point(aes(color = factor(q)), alpha = 0.4) +
  facet_wrap(~method, scales = "free_y") +
  labs(
    title = "Bias for Income Histogram",
    subtitle = "Selected Percentiles"
  )

histogram_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(q == 0.5,
         delta %in% c(NA, 0.000001)) %>%
  ggplot(aes(epsilon, rmse)) +
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  facet_wrap(~method) +
  labs(
    title = "Median RMSE for Income Histogram"
  )

histogram_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         q %in% c(0.1, 0.5, 0.9)) %>%
  ggplot(aes(factor(epsilon), rmse)) +
  geom_line(aes(group = factor(q)), alpha = 0.2) +
  geom_point(aes(color = factor(q)), alpha = 0.4) +
  facet_wrap(~method, scales = "free_y") +
  labs(
    title = "RMSE for Income Histogram",
    subtitle = "Selected Percentiles"
  )

Mean Income

No Noise

We calculate the unweighted and weighted mean of income from salaries and wages.

# unweighted mean
true_mean_income <- mean(x = asec$INCWAGE)

# unweight CI
ci_length_original <- t.test(asec$INCWAGE)$conf.int[2] - t.test(asec$INCWAGE)$conf.int[1]

Noisy Results

This table summarizes the results across all methods for means.

mean_income_results <- read_csv(here::here("results", "03_cps_mean-income.csv"), guess_max = 4500)

mean_income_results <- mean_income_results %>%
  mutate(
    noisy_mean = bias + true_mean_income,
    relative_bias = bias / true_mean_income#,
    #`Median Relative CI Length` = median(ci_length / ci_length_original)#,
  )

summary_mean_income_results <- mean_income_results %>%
  group_by(method, epsilon, delta) %>%
  summarize(
    `Median Relative Bias` = median(relative_bias), 
    RMSE = sqrt(mean(bias ^ 2)),
    `Median CI Overlap` = median(ci_overlap, na.rm = TRUE),
    `NAs in CI Overlap` = sum(is.na(ci_overlap)),
  ) %>%
  ungroup()

summary_mean_income_results %>%
  mutate(
    `Median Relative Bias` = round(`Median Relative Bias`, digits = 3),
    RMSE = round(RMSE, digits = 3),
    `Median CI Overlap` = round(`Median CI Overlap`, digits = 3),
    `NAs in CI Overlap` = round(`NAs in CI Overlap`, digits = 0)
  ) %>%
  datatable()
summary_mean_income_results %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001)) %>%
  ggplot(aes(epsilon, `Median Relative Bias`)) +
  geom_hline(yintercept = 0, color = "red") +
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  labs(title = "Median Relative Bias for Mean Income")

summary_mean_income_results %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001)) %>%
  ggplot(aes(epsilon, `Median CI Overlap`)) +
  geom_hline(yintercept = 1, color = "red") +  
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  labs(title = "CI Overlap for Mean Income")

summary_mean_income_results %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001)) %>%
  ggplot(aes(epsilon, RMSE)) +
  geom_hline(yintercept = 0, color = "red") +  
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  labs(title = "RMSE for Mean Income")

Mean Earned Income

No Noise

We calculate the unweighted and weighted mean of income from salaries and wages.

# unweighted mean
true_mean_earned_income <- mean(x = eitc$earned_income_2014)

# unweighted CI
ci_length_original_earned <- t.test(eitc$earned_income_2014)$conf.int[2] - t.test(eitc$earned_income_2014)$conf.int[1]

Noisy Results

This table summarizes the results across all methods for means.

mean_earned_income_results <- read_csv(here::here("results", "03_cps_mean-earned-income.csv"), guess_max = 10000)

mean_earned_income_results <- mean_earned_income_results %>%
  mutate(
    noisy_mean = bias + true_mean_income,
    relative_bias = bias / true_mean_income#,
    #`Median Relative CI Length` = median(ci_length / ci_length_original)#,
  )

summary_mean_earned_income_results <- mean_earned_income_results %>%
  group_by(method, epsilon, delta) %>%
  summarize(
    `Median Relative Bias` = median(relative_bias), 
    RMSE = sqrt(mean(bias ^ 2)),
    `Median CI Overlap` = median(ci_overlap, na.rm = TRUE),
    `NAs in CI Overlap` = sum(is.na(ci_overlap)),
  ) %>%
  ungroup()

summary_mean_income_results %>%
  mutate(
    `Median Relative Bias` = round(`Median Relative Bias`, digits = 3),
    RMSE = round(RMSE, digits = 3),
    `Median CI Overlap` = round(`Median CI Overlap`, digits = 3),
    `NAs in CI Overlap` = round(`NAs in CI Overlap`, digits = 0)
  ) %>%
  datatable()
summary_mean_earned_income_results %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         epsilon < 100) %>%
  ggplot(aes(epsilon, `Median Relative Bias`)) +
  geom_hline(yintercept = 0, color = "red") +
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  labs(title = "Median Relative Bias for Mean Earned Income")

summary_mean_earned_income_results %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         epsilon < 100) %>%
  ggplot(aes(epsilon, `Median CI Overlap`)) +
  geom_hline(yintercept = 1, color = "red") +  
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  labs(title = "CI Overlap for Mean Earned Income")

summary_mean_earned_income_results %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         epsilon < 100) %>%
  ggplot(aes(epsilon, RMSE)) +
  geom_hline(yintercept = 0, color = "red") +  
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  labs(title = "RMSE for Mean Earned Income")

Quantiles

No Noise

We calculate unweighted and weighted percentiles of income from salaries and wages.

percentiles <- seq(from = 0.1, to = 0.9, by = 0.1)

# unweighted percentiles
true_quantile <- quantile(x = asec$INCWAGE, probs = percentiles)

# weighted percentiles
wtd.quantile(x = asec$INCWAGE, weights = asec$ASECWT, probs = percentiles)
##   10%   20%   30%   40%   50%   60%   70%   80%   90% 
##     0     0     0  2500 16000 27000 39000 52000 78000

Noisy Results

With quantile_smooth

quantile_results <- read_csv(here::here("results", "03_cps_quantiles.csv"))

quantile_results <- quantile_results %>%
  mutate(
    relative_bias =  bias / mean(true_quantile)
  )

quantile_results_summary <- quantile_results %>%
  group_by(method, epsilon, delta) %>%
  summarize(
    `Median bias` = median(bias),
    `Median relative bias` = median(relative_bias),
    RMSE = sqrt(mean(bias ^ 2))
    ) %>%
  ungroup()

quantile_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         epsilon < 100,
         epsilon >= 0.1) %>%
  ggplot(aes(epsilon, `Median relative bias`)) +
  geom_hline(yintercept = 0, color = "red") +
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  facet_wrap(~method, scales = "free_y") +
  labs(
    title = "Median Relative Bias for Income Quantiles"
  )

quantile_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         epsilon < 100,
         epsilon >= 0.1) %>%
  ggplot(aes(epsilon, `Median bias`)) +
  geom_hline(yintercept = 0, color = "red") +  
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  facet_wrap(~method, scales = "free_y") +  
  labs(
    title = "Median Bias for Income Quantiles"
  )

quantile_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         epsilon < 100,
         epsilon >= 0.1) %>%
  ggplot(aes(epsilon, RMSE)) +
  geom_hline(yintercept = 0, color = "red") +  
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  facet_wrap(~method, scales = "free_y") +  
  labs(
    title = "RMSE for Income Quantiles"
  )

Without quantile_smooth

quantile_results_summary <- quantile_results_summary %>%
  filter(method != "quantile_smooth")

quantile_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         epsilon < 100,
         epsilon >= 0.1) %>%
  ggplot(aes(epsilon, `Median relative bias`)) +
  geom_hline(yintercept = 0, color = "red") +
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  facet_wrap(~method) +
  labs(
    title = "Median Relative Bias for Income Quantiles"
  )

quantile_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         epsilon < 100,
         epsilon >= 0.1) %>%
  ggplot(aes(epsilon, `Median bias`)) +
  geom_hline(yintercept = 0, color = "red") +  
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  facet_wrap(~method) +  
  labs(
    title = "Median Bias for Income Quantiles"
  )

quantile_results_summary %>%
  filter(epsilon > 0.01) %>%
  filter(delta %in% c(NA, 0.000001),
         epsilon < 100,
         epsilon >= 0.1) %>%
  ggplot(aes(epsilon, RMSE)) +
  geom_hline(yintercept = 0, color = "red") +  
  geom_line(aes(group = method), alpha = 0.2) +
  geom_point(aes(color = method), alpha = 0.4) +
  facet_wrap(~method) +  
  labs(
    title = "RMSE for Income Quantiles"
  )