flowchart TB
subgraph Pre["Pre-Transformation"]
P1[Load Raw Data]
P2{Column Check}
P3[Row Count Recording]
P4[Duplicate Check]
P5[NULL Inventory]
P6[Generate Pre-Report]
end
subgraph Transform["Transformation"]
T1[Per-Transform Validation]
T2[Lookup Join Validation]
T3[Type Conversion Checks]
end
subgraph Post["Post-Transformation"]
Q1[Row Preservation Check]
Q2[Completeness Analysis]
Q3[Critical Field Validation]
Q4[Generate Quality Report]
end
P1 --> P2
P2 -->|Pass| P3
P2 -->|Fail| STOP1[STOP: Missing Columns]
P3 --> P4
P4 --> P5
P5 --> P6
P6 --> T1
T1 --> T2
T2 --> T3
T3 --> Q1
Q1 --> Q2
Q2 --> Q3
Q3 --> Q4
Q4 --> OUTPUT[Quality Report JSON]
6 Quality Gates
7 Quality Assurance System
The BMF pipeline includes a comprehensive quality assurance system with pre-transformation and post-transformation validation gates.
7.1 Quality Flow
8 Pre-Transformation Validation
File: R/quality/pre_checks.R
Function: validate_raw_bmf_structure()
8.1 Checks Performed
8.1.1 1. Required Columns
Validates that all 31 expected columns exist in the raw BMF:
BMF_REQUIRED_COLUMNS <- c(
"EIN", "NAME", "ICO", "STREET", "CITY", "STATE", "ZIP",
"GROUP", "SUBSECTION", "AFFILIATION", "CLASSIFICATION",
"RULING", "DEDUCTIBILITY", "FOUNDATION", "ACTIVITY",
"ORGANIZATION", "STATUS", "TAX_PERIOD", "ASSET_CD",
"INCOME_CD", "FILING_REQ_CD", "PF_FILING_REQ_CD",
"ASSET_AMT", "INCOME_AMT", "REVENUE_AMT", "NTEE_CD",
"SORT_NAME", "ACCT_PD", "REGION", "RYEAR", "ID"
)Action on failure: STOP (strict mode) or WARN
8.1.2 2. Row Count
Records the row count for comparison during post-transformation validation.
Action: Informational - row count is logged and stored for later comparison
8.1.3 3. Duplicate EINs
Counts duplicate EIN values across the dataset.
Action: Report count (informational - duplicates may be expected from regional file overlaps)
8.1.4 4. NULL Inventory
Reports NULL/empty counts for each required column.
Action: Flag columns with >10% missing as potential data quality issues
8.2 Usage
pre_results <- validate_raw_bmf_structure(
bmf_raw,
required_cols = BMF_REQUIRED_COLUMNS,
strict = TRUE # Stop on failure
)8.3 Output
========================================
PRE-TRANSFORMATION VALIDATION RESULTS
========================================
Timestamp: 2025-01-15 10:30:00
Row count: 1,898,175
Column count: 31
Missing columns: 0
Duplicate EINs: 0
Columns with >10% missing values:
- TAX_PERIOD: 404,425 (21.3%)
- ASSET_AMT: 424,064 (22.3%)
OVERALL: PASSED
========================================
9 Per-Transform Validation
Each transform function includes built-in validation:
9.1 Input Validation
validate_data_table(dt, required_cols, context)Checks: - Input is a data.table - Required columns exist - Fails fast with descriptive error message
9.2 Lookup Validation
validate_lookup(lookup, required_cols, lookup_name)Checks: - Lookup table has required columns - Lookup table is not empty
9.3 Join Success Validation
validate_join_success(dt, key_col, result_col, lookup_name)Checks: - Counts rows where lookup produced NULL - Warns if unmatched rate > 1% - Reports sample of unmatched keys
9.4 Type Conversion Validation
validate_type_conversion(original, converted, col_name)Checks: - No non-NA values became NA during conversion - Reports failed conversion count
10 Post-Transformation Validation
File: R/quality/post_checks.R
Function: generate_quality_report()
10.1 Checks Performed
10.1.1 1. Row Preservation
Ensures input row count equals output row count:
if (nrow(dt) != pre_check_results$row_count) {
report$row_preservation <- FALSE
report$passed <- FALSE
}10.1.2 2. Expected Columns
Validates all expected output columns exist:
BMF_OUTPUT_COLUMNS <- c(
"ein", "ein_raw",
"org_name_raw", "org_name_join", "org_name_display", "org_legal_suffix",
"org_parent_name", "dba_name", "dba_name_raw",
"in_care_of_name_raw", "in_care_of_name_clean", "in_care_of_name_provided",
# ... ~76 columns total covering identity, address, classification,
# codes, dates, financial, activity, filing, and NTEE fields
)10.1.3 3. Completeness Rates
Calculates percentage of non-NULL values for each column:
completeness <- sapply(present_cols, function(col) {
100 * sum(!is.na(dt[[col]]) & dt[[col]] != "") / nrow(dt)
})10.1.4 4. Critical Field Validation
Ensures critical fields have no NULLs:
CRITICAL_FIELDS <- c("ein")Action on failure: Mark report as FAILED
10.1.5 5. Summary Statistics
Generates key metrics: - Unique EIN count - Subsection distribution - Organization type distribution - Total assets/median assets
10.2 Quality Report Structure
report <- list(
passed = TRUE,
timestamp = Sys.time(),
row_count = nrow(dt),
column_count = ncol(dt),
row_preservation = TRUE,
missing_columns = character(0),
extra_columns = character(0),
overall_completeness = 0,
critical_field_issues = list(),
category_reports = list(),
summary_stats = list()
)10.3 Output Functions
10.3.1 Print to Console
print_quality_report(report)================================================================================
POST-TRANSFORMATION QUALITY REPORT
================================================================================
Timestamp: 2025-01-15 11:45:00
Final row count: 1,898,175
Final column count: 76
Overall completeness: 94.5%
Row preservation: PASSED
--------------------------------------------------------------------------------
COMPLETENESS BY CATEGORY
--------------------------------------------------------------------------------
[Identity] (2 columns, avg completeness: 100.0%)
All columns 100% complete
[Financial Amounts] (3 columns, avg completeness: 77.7%)
- asset_amount: 77.7%
- income_amount: 77.7%
--------------------------------------------------------------------------------
SUMMARY STATISTICS
--------------------------------------------------------------------------------
Identity:
Unique EINs: 1,898,175
Financial:
Total Assets: $12,345,678,901,234
Median Assets: $50,000
================================================================================
OVERALL RESULT: PASSED
================================================================================
10.3.2 Save to JSON
save_quality_report(report, "data/quality/bmf_2025_quality_report.json")10.3.3 Render to HTML/PDF
render_quality_report(report, "data/quality/bmf_2025_quality_report.html")
render_quality_report(report, "data/quality/bmf_2025_quality_report.pdf", format = "pdf")The HTML/PDF report includes:
- Executive Summary: Pass/fail status, row count, timestamp
- Field Completeness Table: Sortable table showing completeness % per field
- Completeness Chart: Bar chart highlighting fields with incomplete data
- Subsection Distribution: Bar chart showing organizations by IRS subsection
- Organization Type Distribution: Bar chart showing top 15 org types
- Financial Summary: Total and median asset amounts
Requirements: - quarto R package: install.packages("quarto") - Quarto CLI: https://quarto.org/docs/get-started/ - For PDF output: LaTeX distribution (e.g., TinyTeX)
11 Troubleshooting Common Failures
11.1 Pre-Validation Failures
11.1.1 “Missing required columns”
Cause: IRS changed the BMF schema
Fix: 1. Check IRS documentation for schema changes 2. Update BMF_REQUIRED_COLUMNS in pre_checks.R 3. Add/modify transforms for new columns
11.1.2 “Row count changed during transformation”
Cause: A transformation step unexpectedly added or removed rows
Fix: 1. Check transformation logs to identify which step changed row count 2. Review the specific transform function for bugs 3. Ensure joins use appropriate join type (should not drop rows)
11.2 Transform Failures
11.2.1 “Lookup join failed for X values”
Cause: New codes in BMF not in lookup table
Fix: 1. Check which codes are unmatched: r dt[is.na(definition_col), unique(code_col)] 2. Update lookup table in data/lookup/ 3. Re-run pipeline
11.2.2 “Type conversion failed for X values”
Cause: Unexpected data format in source
Fix: 1. Examine failed values: r dt[is.na(output_col) & !is.na(input_col), unique(input_col)] 2. Update transform to handle edge case 3. Re-run from checkpoint
11.3 Post-Validation Failures
11.3.1 “Row count changed during transformation”
Cause: Transform added/removed rows unexpectedly
Fix: 1. Check each transform’s row count 2. Identify which transform changed row count 3. Fix transform logic or update expected behavior
11.3.2 “Critical field has NULL values”
Cause: Transform failed to populate required field
Fix: 1. Trace back to transform that creates the field 2. Check input data and lookup tables 3. Fix transform or data issue
12 Validation Functions Reference
12.1 validate_data_table()
validate_data_table(dt, required_cols, context = "data")| Parameter | Description |
|---|---|
dt |
Object to validate |
required_cols |
Character vector of required column names |
context |
String for error messages (e.g., “BMF data”) |
Throws: Error if validation fails
12.2 validate_lookup()
validate_lookup(lookup, required_cols, lookup_name)| Parameter | Description |
|---|---|
lookup |
Lookup table to validate |
required_cols |
Required columns |
lookup_name |
Name for error messages |
Throws: Error if validation fails
12.3 validate_join_success()
validate_join_success(dt, key_col, result_col, lookup_name)| Parameter | Description |
|---|---|
dt |
Table after join |
key_col |
Join key column |
result_col |
Column that should have been populated |
lookup_name |
Lookup table name for messages |
Warns: If >1% unmatched
12.4 validate_step()
validate_step(dt_before, dt_after, expected_new_cols, step_name)Validates a single transformation step: - Row count preserved - Expected columns added
12.5 render_quality_report()
render_quality_report(report, output_path, format = c("html", "pdf"), open = FALSE)| Parameter | Description |
|---|---|
report |
Quality report list from generate_quality_report() |
output_path |
Path for output file (e.g., “data/quality/report.html”) |
format |
Output format: “html” (default) or “pdf” |
open |
If TRUE, open report in browser after rendering |
Returns: Path to the rendered report file (invisibly)
13 Quality Report JSON Schema
{
"passed": true,
"timestamp": "2025-01-15T11:45:00",
"row_count": 1898175,
"column_count": 76,
"row_preservation": true,
"missing_columns": [],
"extra_columns": [],
"overall_completeness": 94.5,
"critical_field_issues": {},
"category_reports": {
"identity": {
"category_name": "Identity",
"category_key": "identity",
"column_count": 2,
"columns_present": 2,
"avg_completeness": 100.0,
"columns": {
"ein": {
"column_name": "ein",
"column_type": "character",
"present": true,
"completeness": {
"total": 1898175,
"non_null": 1898175,
"null_count": 0,
"empty_count": 0,
"completeness_pct": 100.0,
"missing_pct": 0.0
},
"type_stats": {
"unique_count": 1898175,
"min_length": 10,
"max_length": 10,
"avg_length": 10.0,
"valid_count": 1898175
}
}
}
},
"financial_amounts": {
"category_name": "Financial Amounts",
"column_count": 3,
"avg_completeness": 77.7,
"columns": {}
}
},
"column_completeness": {
"ein": {
"completeness_pct": 100.0,
"source_columns": "EIN",
"null_count": 0,
"empty_count": 0
},
"asset_amount": {
"completeness_pct": 77.7,
"source_columns": "ASSET_AMT",
"null_count": 424064,
"empty_count": 0
}
},
"summary_stats": {
"unique_eins": 1898175,
"duplicate_eins": 0,
"unique_org_names": 1850000,
"financial": {
"total_assets": 12345678901234,
"median_assets": 50000,
"orgs_with_assets": 1474111,
"orgs_zero_assets": 125000,
"total_income": 9876543210987,
"median_income": 25000,
"total_revenue": 8765432109876,
"median_revenue": 20000
},
"address_quality": {
"missing_count": 15000,
"missing_pct": 0.79,
"po_box_count": 250000,
"po_box_pct": 13.17,
"rural_route_count": 5000,
"invalid_state_count": 100
},
"ruling_date_range": {
"earliest": "1940-01-01",
"latest": "2025-01-15",
"orgs_with_ruling_date": 1800000
},
"org_type_distribution": [
{"exempt_organization_type": "501(c)(3)", "N": 1234567},
{"exempt_organization_type": "501(c)(4)", "N": 234567}
],
"ntee_major_group_distribution": [
{"ntee_code_major_group": "N", "N": 150000},
{"ntee_code_major_group": "P", "N": 120000}
],
"tax_period_year_distribution": [
{"tax_year": "2024", "N": 1500000},
{"tax_year": "2023", "N": 398175}
]
},
"source_column_map": {
"ein": "EIN",
"org_name_raw": "NAME",
"asset_amount": "ASSET_AMT"
},
"column_descriptions": {
"ein": "Employer Identification Number formatted as XX-XXXXXXX",
"asset_amount": "Total assets in dollars (most recent return)"
}
}