Form: 990ez Tax year: 2020 Generated: 2026-05-11 17:08:55
✅ PASSED — Strict mode: TRUE
Executive Summary
Row count
210,600
Column count
76
Vintage-aware completeness
99.53%
Raw completeness (all cols / all rows)
99.53%
Unique EINs
210,226
Duplicate EINs (amendments)
374
Missing columns vs schema
0
Extra columns vs schema
0
YoY row-count delta vs prior run
no_baseline
Identity-Field Checks
Code
# Identity-field checks: build a two-column table (Check, Result) from the
# validator's critical-field findings and render it with kable().
# An entry under r$critical_field_issues is NULL when that field had no issue.
ein <- r$critical_field_issues$ein
tp <- r$critical_field_issues$tax_period
sub <- r$critical_field_issues$subsection
typ <- r$critical_field_issues$types

# Duplicate EINs are informational only. Fall back to NA when the statistic is
# absent so sprintf() still yields exactly one string (sprintf("%d", NULL)
# returns character(0), which would shorten the Result column).
dup_eins <- r$summary_stats$duplicate_eins
if (length(dup_eins) != 1L) {
  dup_eins <- NA_integer_
}

# Turn one counter of an issue entry into exactly one result string.
#   issue    - the issue entry for a field, or NULL when nothing was flagged
#   field    - name of the counter inside `issue` (partial matching is kept,
#              mirroring what `$` did in the previous version of this chunk)
#   ok_msg   - text returned when there is nothing to report
#   fail_fmt - sprintf() format with one %d, used when the counter is non-zero
# NOTE(review): an absent, NA or zero counter is treated as "nothing found for
# this sub-check" - confirm against the validator's output structure.
check_result <- function(issue, field, ok_msg, fail_fmt) {
  n <- if (is.null(issue)) NULL else issue[[field, exact = FALSE]]
  if (length(n) != 1L || is.na(n) || n == 0) {
    return(ok_msg)
  }
  sprintf(fail_fmt, as.integer(n))
}

type_result <- if (is.null(typ)) {
  "✅ all types correct"
} else {
  sprintf("❌ %s", paste(typ$issues, collapse = "; "))
}

ident <- data.table(
  Check = c(
    "EIN format (XX-XXXXXXX)", "EIN null count", "EIN duplicates (soft)",
    "tax_period format (YYYYMM)", "tax_period out-of-range",
    "subsection_cd whitelist", "Column types"
  ),
  Result = c(
    check_result(ein, "malformed", "✅ all valid", "❌ %d malformed"),
    # The null-count verdict depends only on the EIN issue entry; it no longer
    # consults the unrelated duplicate-EIN statistic.
    check_result(ein, "null_count", "✅ no nulls", "❌ %d nulls"),
    sprintf("ℹ %d (soft, not a failure)", as.integer(dup_eins)),
    check_result(tp, "malformed_count", "✅ all valid", "❌ %d malformed"),
    check_result(tp, "out_of_range_count", "✅ all in range", "❌ %d out of range"),
    check_result(sub, "unknown_count", "✅ all in IRM whitelist", "❌ %d unknown codes"),
    type_result
  )
)
kable(ident)
EIN format (XX-XXXXXXX)
✅ all valid
EIN null count
✅ no nulls
EIN duplicates (soft)
ℹ 374 (soft, not a failure)
tax_period format (YYYYMM)
✅ all valid
tax_period out-of-range
✅ all in range
subsection_cd whitelist
✅ all in IRM whitelist
Column types
✅ all types correct
Subsection Code Distribution
(IRC 501(c) / 501(d) / 501(e) / 501(f) / 501(k) / 501(n) / 521 / 527 / 529 / 4947 / 1381 subsection codes per IRM 25.7.1. See data/lookups/subsection_codes.csv.)
Code
# Subsection code distribution: one row per entry of `top_values`, showing the
# code, its formatted row count and its share of rows.
# (`sd` also names stats::sd; the variable name is kept unchanged here.)
sd <- r$summary_stats$subsection_distribution
if (length(sd$top_values) == 0L) {
  cat("(no subsection data)")
} else {
  tab <- rbindlist(lapply(sd$top_values, as.data.table))
  setnames(tab, c("code", "count", "pct"))
  # Fixed two-decimal formatting: paste0(pct, "%") dropped trailing zeros
  # (e.g. "1.3%" next to "1.32%"), unlike the other tables in this report.
  tab[, pct := sprintf("%.2f%%", as.numeric(pct))]
  tab[, count := format(count, big.mark = ",")]
  kable(tab)
}
3
162,111
76.98%
6
12,934
6.14%
4
11,806
5.61%
7
7,568
3.59%
5
7,506
3.56%
8
2,785
1.32%
19
2,741
1.3%
13
908
0.43%
12
732
0.35%
10
588
0.28%
Financial Summary
Code
# Financial summary: sum and median of revenue, expenses and end-of-year
# assets, each formatted with fmt_money() and shown as one row per metric.
fin <- r$summary_stats$financial
if (length(fin) > 0L) {
  totals <- c(
    fmt_money(fin$revenue$total),
    fmt_money(fin$expenses$total),
    fmt_money(fin$assets$total)
  )
  medians <- c(
    fmt_money(fin$revenue$median),
    fmt_money(fin$expenses$median),
    fmt_money(fin$assets$median)
  )
  tab <- data.table(
    metric = c("total_revenue", "total_expenses", "total_assets_eoy"),
    sum = totals,
    median = medians
  )
  kable(tab)
} else {
  cat("(no financial summary)")
}
total_revenue
$11,767,877,822
$45,091
total_expenses
$10,994,321,643
$40,000
total_assets_eoy
$18,238,347,945
$48,518
Tax Period Year Coverage
Code
# Tax period year coverage: row count and percentage share per tax year,
# sorted by year. `d` maps a year label (its names) to a row count.
d <- r$summary_stats$tax_period_year_distribution
if (length(d) > 0L) {
  tab <- data.table(tax_year = names(d), n = as.integer(unlist(d)))
  setorderv(tab, "tax_year")
  # Both right-hand sides are evaluated before assignment, so the percentage
  # is computed from the numeric counts, not from the formatted strings.
  tab[, `:=`(
    pct = sprintf("%.2f%%", 100 * n / sum(n)),
    n = format(n, big.mark = ",")
  )]
  kable(tab)
} else {
  cat("(no coverage data)")
}
Field Completeness by Category
Columns are grouped by Form section parsed from the crosswalk’s location field.
Code
# Field completeness by category: one row per category report, widest
# category (most columns) first.
cats <- r$category_reports
if (length(cats) > 0L) {
  # Flatten a single category report into a one-row table.
  category_row <- function(report) {
    data.table(
      category = report$category_name,
      n_cols = report$column_count,
      cols_present = report$columns_present,
      avg_completeness = sprintf("%.2f%%", report$avg_completeness)
    )
  }
  tab <- rbindlist(lapply(cats, category_row))
  setorderv(tab, "n_cols", order = -1L)
  kable(tab)
} else {
  cat("(no category breakdown)")
}
sched_a
32
32
99.30%
part_i
20
20
100.00%
part_v
13
13
99.93%
header
3
3
100.00%
part_ii
3
3
100.00%
other
1
1
87.48%
Completeness by Vintage Cohort
When a single tax-year file blends rows from multiple extract_year × source_form cohorts, the per-cohort completeness reveals whether any vintage is dragging the overall metric down. A clean run produces 100% for every cohort.
Code
# Completeness by vintage cohort: one row per extract_year x source_form
# cohort with its row count, expected column count and completeness.
pc <- r$completeness_by_cohort
if (length(pc) > 0L) {
  # Flatten a single cohort entry into a one-row table.
  cohort_row <- function(cohort) {
    data.table(
      extract_year = cohort$extract_year,
      source_form = cohort$source_form,
      rows = format(cohort$n_rows, big.mark = ","),
      expected_cols = cohort$n_expected_cols,
      completeness_pct = sprintf("%.2f%%", cohort$completeness_pct)
    )
  }
  tab <- rbindlist(lapply(pc, cohort_row))
  kable(tab)
} else {
  cat("(single cohort)")
}
2020
990ez
26,377
76
98.35%
2021
990ez
137,878
76
99.71%
2022
990ez
39,557
76
99.71%
2023
990ez
5,084
76
99.79%
2024
990ez
1,704
76
98.20%
Data Issues
Code
# Data issues: collect the hard-check findings (missing columns, extra
# columns, critical-field issues) as text lines and print them as a bullet
# list, or print a success line when nothing was collected.
issues <- character(0)

# Schema drift: report the total count but list at most the first 10 names.
if (length(r$missing_columns) > 0L) {
  missing_names <- paste(head(r$missing_columns, 10), collapse = ", ")
  issues <- c(
    issues,
    sprintf("Missing %d columns: %s", length(r$missing_columns), missing_names)
  )
}
if (length(r$extra_columns) > 0L) {
  extra_names <- paste(head(r$extra_columns, 10), collapse = ", ")
  issues <- c(
    issues,
    sprintf("Extra %d columns: %s", length(r$extra_columns), extra_names)
  )
}

# One line per named critical-field entry, with its values flattened.
if (length(r$critical_field_issues) > 0L) {
  field_lines <- vapply(
    names(r$critical_field_issues),
    function(field) {
      detail <- paste(unlist(r$critical_field_issues[[field]]), collapse = " | ")
      sprintf("%s: %s", field, detail)
    },
    character(1),
    USE.NAMES = FALSE
  )
  issues <- c(issues, field_lines)
}

if (length(issues) > 0L) {
  cat(paste0("- ", issues, " \n "), sep = "")
} else {
  cat("✅ No hard-check issues detected. \n ")
}
✅ No hard-check issues detected.
Generated from /mnt/c/Users/tpoongundranar/Documents/Urban/NCCS/nccs-data-core/data/logs/quality_990ez_2020.rds. See R/quality/post_checks.R for the validator implementation and docs/05-quality-gates.qmd for the gate definitions.