library( postmastr )
library( dplyr )
12 Address Validation
Survey respondents frequently enter address fields that are invalid because they are abbreviated, misspelled, out of order, etc.
In this section, we’ll parse the addresses from the survey using the postmastr package for that was designed to facilitate turning raw addresses into the standardized versions used by the US Post Office.
12.1 Process Address Columns using Postmastr
Our survey data set has 3 columns with address information:
1. Addr_ZIP: Postal Codes
2. Addr_CityState : City and State
3. Addr_StreetLocation: Street Names
We first create a row.id column so that we can later merge our postmastr results back with the survey dataset.
<-
survey_df %>%
survey_df ::mutate( row.id = dplyr::row_number() ) dplyr
12.1.1 Postal Codes
12.1.1.1 Process Postal Code Column
We first explore the postal codes in Addr_ZIP.
<-
zipcodes %>%
survey_df ::select( row.id, Addr_ZIP )
dplyr
<- postmastr::pm_identify( zipcodes, var = "Addr_ZIP" )
zip_id <- postmastr::pm_prep( zip_id, var = "Addr_ZIP", type = "zip" ) zip_prep
Our postmastr object returns FALSE when asked if all ZIP codes in the survey data set are valid. This requires additional exploration.
<-
zip_detect ::pm_postal_detect( zip_prep ) %>%
postmastr::filter( pm.hasZip == FALSE ) dplyr
pm_postal_detect() reveals that the survey responses without a valid ZIP code have the following values: -99, which indicate missingness. We can thus remove them from our postal code data set. pm_postal_detect() reveals that the survey responses without a valid ZIP code have the following values: , which indicate missingness. We can thus remove them from our postal code data set.
<-
zipcodes %>%
survey_df ::select( row.id, Addr_ZIP ) %>%
dplyr::filter( ! Addr_ZIP %in% c(NA,"-99") )
dplyr
<- postmastr::pm_identify( zipcodes, var = "Addr_ZIP" )
zip_id <- postmastr::pm_prep( zip_id, var = "Addr_ZIP", type = "zip" ) zip_prep
Our postmastr object now returns FALSE when asked if all ZIP codes in the survey data set are valid.
12.1.1.2 Parsing Zip Codes
We can now parse and process all postal codes. After parsing, we left join our results with the original dataset to keep row.ids.
<- postmastr::pm_postal_parse( zip_prep )
zip_parsed
<-
zip_parsed %>%
zip_id ::left_join( zip_parsed ) %>%
tidylog::select( row.id, pm.zip, pm.zip4 ) dplyr
12.1.2 City and State
We can now repeat the process for Addr_CityState.
12.1.2.1 Processing Data
<- postmastr::pm_dictionary( type = "state" )
statedict
<- survey_df %>%
survey_addr_citystate ::select(row.id, Addr_CityState) %>%
dplyr::filter(!Addr_CityState %in% c(NA, "-99")) %>%
dplyr::mutate(
dplyrAddr_CityState = gsub("\\.", "", Addr_CityState),
Addr_CityState = gsub("Falls ChurchVA 22042", "Falls Church, VA", Addr_CityState),
Addr_CityState = gsub("Sonoma cA", "Sonoma, CA", Addr_CityState)
)
<-
postmastr_citystate_id ::pm_identify( survey_addr_citystate,
postmastrvar = "Addr_CityState" )
<-
postmastr_citystate_prep ::pm_prep( postmastr_citystate_id,
postmastrvar = "Addr_CityState",
type = "state")
Our postmastr object now returns FALSE when asked if all states in the Addr_CityState column are valid.
We next examine the rows without a valid state.
pm_state_none(postmastr_citystate_prep)
# A tibble: 253 × 2
pm.uid pm.address
<int> <chr>
1 2 Winthrop
2 6 Aurora
3 11 Jackson Heights
4 19 OSSINING
5 21 Monticello
6 22 San Diego
7 23 NEW HARTFORD
8 25 Earlville
9 27 Rochester
10 28 Niagara Falls
# ℹ 243 more rows
This data set contains some title case state abbreviations that are undetected. We can append them to our state dictionary. We can then check the unmatched variables to double-check if any states were missed.
<-
states_append ::pm_append(
postmastrtype = "state",
input = c(
"Fl",
"North Caroling",
"Tx",
"Ca",
"Falls ChurchVA 22042",
"GA 30014",
"NV.",
"Ky",
"Ma",
"CO.",
"OaklandCA",
"Mi",
"cA",
"Ny"
),output = c(
"FL",
"NC",
"TX",
"CA",
"VA",
"GA",
"NV",
"KY",
"MA",
"CO",
"CA",
"MI",
"CA",
"NY"
),locale = "us"
)
<-
statedict ::pm_dictionary( type = "state", append = states_append )
postmastr
pm_state_none( postmastr_citystate_prep, dictionary = statedict )
# A tibble: 242 × 2
pm.uid pm.address
<int> <chr>
1 2 Winthrop
2 6 Aurora
3 11 Jackson Heights
4 19 OSSINING
5 21 Monticello
6 22 San Diego
7 23 NEW HARTFORD
8 25 Earlville
9 27 Rochester
10 28 Niagara Falls
# ℹ 232 more rows
12.1.2.2 Parsing State Data
Once processing is complete, we can parse States from the Addr_CityState column.
<-
postmastr_state_parsed ::pm_state_parse( postmastr_citystate_prep,
postmastrdictionary = statedict )
postmastr_state_parsed
# A tibble: 567 × 3
pm.uid pm.address pm.state
<int> <chr> <chr>
1 1 WINTHROP ME
2 2 Winthrop <NA>
3 3 Westbrook ME
4 4 Port Angeles WA
5 5 Portland ME
6 6 Aurora <NA>
7 7 Bethpage NY
8 8 Brooklyn NY
9 9 Hauppauge NY
10 10 Media PA
# ℹ 557 more rows
12.1.2.3 Processing City Data
We repeat the same process for city data. We include a filter for the states identified after parsing with Postmastr.
<-
states unique( postmastr_state_parsed$pm.state ) %>%
na.omit()
<- postmastr::pm_dictionary( type = "city", filter = states )
citydict
pm_city_none( postmastr_state_parsed, dictionary = citydict )
# A tibble: 46 × 3
pm.uid pm.address pm.state
<int> <chr> <chr>
1 56 Saint Clair Shores <NA>
2 57 St Louis MO
3 68 Westbrookville <NA>
4 72 Wernersville PA
5 73 Newfoundland PA
6 77 Dillsburg <NA>
7 84 Lititz PA
8 92 Indianapolis IN
9 100 Pacoima <NA>
10 108 Sonoma cA <NA>
# ℹ 36 more rows
After processing state names, our postmastr object contains some unmatched cities. We create an additional city-level dictionary to append to our current city-level dictionary. We coerce valid city names to NA values to let postmastr know that these inputs are valid.
CURRENT | REPLACEMENT_VALUE | |
---|---|---|
44 | Sonoma | NA |
45 | Williston Pk | Williston Park |
46 | Wernersville | NA |
47 | St Louis | St. Louis |
48 | La Canada Flintridge | NA |
49 | Falls Church | Falls Church |
<-
city_append ::pm_append(
postmastrtype = "city",
input = input.x,
output = output.x
)
<-
citydict ::pm_dictionary( type = "city",
postmastrfilter = states,
append = city_append )
pm_city_none( postmastr_state_parsed, dictionary = citydict )
# A tibble: 5 × 3
pm.uid pm.address pm.state
<int> <chr> <chr>
1 108 "Sonoma cA" <NA>
2 127 "Falls ChurchVA 22042" <NA>
3 146 "AHMEDABAD" <NA>
4 265 "Tay Ho Ha Noi" <NA>
5 565 "" <NA>
The unmatched cities in our data set are not from the US and will be ignored.
<-
postmastr_citystate_parsed ::pm_city_parse( postmastr_state_parsed, dictionary = citydict )
postmastr
<- postmastr_citystate_parsed %>%
citystate_parsed ::left_join( postmastr_citystate_id ) %>%
tidylog::select( row.id, pm.city, pm.state ) dplyr
12.1.3 Street Names
12.1.3.1 Prepare Data
The final column to process is Addr_StreetLocation which contains street names.
<- survey_df %>%
survey_addr_street ::select(row.id, Addr_StreetLocation) %>%
dplyr::filter(! Addr_StreetLocation %in% c("-99", NA))
dplyr
<-
postmastr_street_id ::pm_identify( survey_addr_street,
postmastrvar = "Addr_StreetLocation" )
<-
postmastr_street_prep ::pm_prep( postmastr_street_id,
postmastrvar = "Addr_StreetLocation",
type = "street" )
head( postmastr_street_prep, 10 )
# A tibble: 10 × 2
pm.uid pm.address
<int> <chr>
1 1 305 WINTHROP CENTER ROAD
2 2 188 CASE RD
3 3 15 Saunders Way #500D
4 4 819 Georgiana St.
5 5 P.O. Box 761
6 6 712 S. River St. STE H
7 7 15 Grumman Road west Suite 1000
8 8 1000 Dean Street Suite 420
9 9 10 Davids Drive
10 10 144 Longview Circle
12.1.3.2 Parse House Numbers
Next, we extract house/unit numbers from the street addresses.
<- postmastr::pm_house_parse( postmastr_street_prep )
postmastr_house_parsed head( postmastr_house_parsed, 10 )
# A tibble: 10 × 3
pm.uid pm.address pm.house
<int> <chr> <chr>
1 1 WINTHROP CENTER ROAD 305
2 2 CASE RD 188
3 3 Saunders Way #500D 15
4 4 Georgiana St. 819
5 5 P.O. Box 761 <NA>
6 6 S. River St. STE H 712
7 7 Grumman Road west Suite 1000 15
8 8 Dean Street Suite 420 1000
9 9 Davids Drive 10
10 10 Longview Circle 144
12.1.3.3 Parse Street Prefix and Suffix
To parse street prefixes and suffixes requires 2 dictionaries. 1 with directions and 1 with conversions for suffixes like “street” or “boulevard”.
<- pm_dictionary( type = "directional", locale = "us" )
dirs
<-
postmastr_streetdir_parsed ::pm_streetDir_parse( postmastr_house_parsed, dictionary = dirs )
postmastr
<-
postmastr_streetSuf_parsed ::pm_streetSuf_parse( postmastr_streetdir_parsed ) postmastr
12.1.3.4 Parse Street Names
The final element for parsing is the street names themselves.
<-
postmaster_street_parsed ::pm_street_parse( postmastr_streetSuf_parsed,
postmastrordinal = TRUE,
drop = TRUE)
postmaster_street_parsed
# A tibble: 665 × 6
pm.uid pm.house pm.preDir pm.street pm.streetSuf pm.sufDir
<int> <chr> <chr> <chr> <chr> <chr>
1 1 305 <NA> Winthrop Center Rd <NA>
2 2 188 <NA> Case Rd <NA>
3 3 15 <NA> Saunders Way #500d <NA> <NA>
4 4 819 <NA> Georgiana St <NA> <NA>
5 5 <NA> <NA> Po. Box 761 <NA> <NA>
6 6 712 S River St Ste H <NA> <NA>
7 7 15 <NA> Grumman Road West Suite 1000 <NA> <NA>
8 8 1000 <NA> Dean Street Suite 420 <NA> <NA>
9 9 10 <NA> Davids Dr <NA>
10 10 144 <NA> Longview Cir <NA>
# ℹ 655 more rows
12.1.3.5 Combine Street Components Into Single Column
After parsing each component, we concatenate them into a single street address.
<-
postmastr_street_full_parsed ::pm_replace( postmaster_street_parsed,
postmastrsource = postmastr_street_id )
<- rlang::quo( !! rlang::sym("pm.sufDir")
endQ
)
<-
postmastr_street_full_parsed ::pm_rebuild( postmastr_street_full_parsed,
postmastroutput = "short",
keep_ids = TRUE )
<- postmastr_street_full_parsed %>%
street_parsed ::left_join(postmastr_street_id) %>%
tidylog::select(row.id, pm.address) dplyr
12.1.4 Combine All Parsed Addresses Components Together
And finally, we concatenate our postal code, city, state and street address into a single f_address column for geocoding.
<- street_parsed %>%
postmastr_faddress_parsed ::left_join(citystate_parsed) %>%
tidylog::left_join(zip_parsed) %>%
tidylog::unite( f_address, pm.address,
tidyr
pm.city, pm.state,
pm.zip, pm.zip4, na.rm = TRUE, sep = ", " )
head(postmastr_faddress_parsed, 3) %>% knitr::kable()
row.id | f_address |
---|---|
1 | 305 Winthrop Center Rd, WINTHROP, ME, 04364 |
2 | 188 Case Rd, Winthrop, 04364 |
3 | 15 Saunders Way #500d, Westbrook, ME, 04092 |
The f_address column now has parsed addresses that can be formatted using a geocoder of choice. We can now rejoin the parsed address data via row.id from our original data set.
<-
survey_geoproc_df %>%
survey_df ::left_join( postmastr_faddress_parsed, by = "row.id" ) tidylog