12  Address Validation

Survey respondents frequently enter address fields that are invalid because they are abbreviated, misspelled, out of order, etc.

In this section, we’ll parse the addresses from the survey using the postmastr package for that was designed to facilitate turning raw addresses into the standardized versions used by the US Post Office.

library( postmastr )
library( dplyr )

12.1 Process Address Columns using Postmastr

Our survey data set has 3 columns with address information:

1. Addr_ZIP: Postal Codes 
2. Addr_CityState : City and State 
3. Addr_StreetLocation: Street Names

We first create a row.id column so that we can later merge our postmastr results back with the survey dataset.

survey_df <- 
  survey_df %>% 
  dplyr::mutate( row.id = dplyr::row_number() )

12.1.1 Postal Codes

12.1.1.1 Process Postal Code Column

We first explore the postal codes in Addr_ZIP.

zipcodes <- 
  survey_df %>% 
  dplyr::select( row.id, Addr_ZIP )

zip_id   <- postmastr::pm_identify( zipcodes, var = "Addr_ZIP" )
zip_prep <- postmastr::pm_prep( zip_id, var = "Addr_ZIP", type = "zip" )

Our postmastr object returns FALSE when asked if all ZIP codes in the survey data set are valid. This requires additional exploration.

zip_detect <- 
  postmastr::pm_postal_detect( zip_prep ) %>% 
  dplyr::filter( pm.hasZip == FALSE )

pm_postal_detect() reveals that the survey responses without a valid ZIP code have the following values: -99, which indicate missingness. We can thus remove them from our postal code data set. pm_postal_detect() reveals that the survey responses without a valid ZIP code have the following values: , which indicate missingness. We can thus remove them from our postal code data set.

zipcodes <- 
  survey_df %>% 
  dplyr::select( row.id, Addr_ZIP ) %>% 
  dplyr::filter( ! Addr_ZIP %in% c(NA,"-99") )

zip_id   <- postmastr::pm_identify( zipcodes, var = "Addr_ZIP" )
zip_prep <- postmastr::pm_prep( zip_id, var = "Addr_ZIP", type = "zip" )

Our postmastr object now returns FALSE when asked if all ZIP codes in the survey data set are valid.

12.1.1.2 Parsing Zip Codes

We can now parse and process all postal codes. After parsing, we left join our results with the original dataset to keep row.ids.

zip_parsed <- postmastr::pm_postal_parse( zip_prep )

zip_parsed <- 
  zip_id %>% 
  tidylog::left_join( zip_parsed ) %>% 
  dplyr::select( row.id, pm.zip, pm.zip4 )

12.1.2 City and State

We can now repeat the process for Addr_CityState.

12.1.2.1 Processing Data

statedict <- postmastr::pm_dictionary( type = "state" )

survey_addr_citystate <- survey_df %>%
  dplyr::select(row.id, Addr_CityState) %>%
  dplyr::filter(!Addr_CityState %in% c(NA, "-99")) %>%
  dplyr::mutate(
    Addr_CityState = gsub("\\.", "", Addr_CityState),
    Addr_CityState = gsub("Falls ChurchVA 22042", "Falls Church, VA", Addr_CityState),
    Addr_CityState = gsub("Sonoma cA", "Sonoma, CA", Addr_CityState)
  )

postmastr_citystate_id <- 
  postmastr::pm_identify( survey_addr_citystate, 
                          var = "Addr_CityState" )

postmastr_citystate_prep <- 
  postmastr::pm_prep( postmastr_citystate_id, 
                      var = "Addr_CityState", 
                      type = "state")

Our postmastr object now returns FALSE when asked if all states in the Addr_CityState column are valid.

We next examine the rows without a valid state.

pm_state_none(postmastr_citystate_prep)
# A tibble: 253 × 2
   pm.uid pm.address     
    <int> <chr>          
 1      2 Winthrop       
 2      6 Aurora         
 3     11 Jackson Heights
 4     19 OSSINING       
 5     21 Monticello     
 6     22 San Diego      
 7     23 NEW HARTFORD   
 8     25 Earlville      
 9     27 Rochester      
10     28 Niagara Falls  
# ℹ 243 more rows

This data set contains some title case state abbreviations that are undetected. We can append them to our state dictionary. We can then check the unmatched variables to double-check if any states were missed.

states_append <- 
  postmastr::pm_append(
  type = "state",
  input = c(
    "Fl",
    "North Caroling",
    "Tx",
    "Ca",
    "Falls ChurchVA 22042",
    "GA 30014",
    "NV.",
    "Ky",
    "Ma",
    "CO.",
    "OaklandCA",
    "Mi",
    "cA",
    "Ny"
  ),
  output = c(
    "FL",
    "NC",
    "TX",
    "CA",
    "VA",
    "GA",
    "NV",
    "KY",
    "MA",
    "CO",
    "CA",
    "MI",
    "CA",
    "NY"
  ),
  locale = "us"
)

statedict <- 
  postmastr::pm_dictionary( type = "state", append = states_append )

pm_state_none( postmastr_citystate_prep, dictionary = statedict )
# A tibble: 242 × 2
   pm.uid pm.address     
    <int> <chr>          
 1      2 Winthrop       
 2      6 Aurora         
 3     11 Jackson Heights
 4     19 OSSINING       
 5     21 Monticello     
 6     22 San Diego      
 7     23 NEW HARTFORD   
 8     25 Earlville      
 9     27 Rochester      
10     28 Niagara Falls  
# ℹ 232 more rows

12.1.2.2 Parsing State Data

Once processing is complete, we can parse States from the Addr_CityState column.

postmastr_state_parsed <- 
  postmastr::pm_state_parse( postmastr_citystate_prep,
                             dictionary = statedict )
postmastr_state_parsed
# A tibble: 567 × 3
   pm.uid pm.address   pm.state
    <int> <chr>        <chr>   
 1      1 WINTHROP     ME      
 2      2 Winthrop     <NA>    
 3      3 Westbrook    ME      
 4      4 Port Angeles WA      
 5      5 Portland     ME      
 6      6 Aurora       <NA>    
 7      7 Bethpage     NY      
 8      8 Brooklyn     NY      
 9      9 Hauppauge    NY      
10     10 Media        PA      
# ℹ 557 more rows

12.1.2.3 Processing City Data

We repeat the same process for city data. We include a filter for the states identified after parsing with Postmastr.

states <- 
  unique( postmastr_state_parsed$pm.state ) %>%
  na.omit()

citydict <- postmastr::pm_dictionary( type = "city", filter = states )

pm_city_none( postmastr_state_parsed, dictionary = citydict )
# A tibble: 46 × 3
   pm.uid pm.address         pm.state
    <int> <chr>              <chr>   
 1     56 Saint Clair Shores <NA>    
 2     57 St Louis           MO      
 3     68 Westbrookville     <NA>    
 4     72 Wernersville       PA      
 5     73 Newfoundland       PA      
 6     77 Dillsburg          <NA>    
 7     84 Lititz             PA      
 8     92 Indianapolis       IN      
 9    100 Pacoima            <NA>    
10    108 Sonoma cA          <NA>    
# ℹ 36 more rows

After processing state names, our postmastr object contains some unmatched cities. We create an additional city-level dictionary to append to our current city-level dictionary. We coerce valid city names to NA values to let postmastr know that these inputs are valid.

CURRENT REPLACEMENT_VALUE
44 Sonoma NA
45 Williston Pk Williston Park
46 Wernersville NA
47 St Louis St. Louis
48 La Canada Flintridge NA
49 Falls Church Falls Church
city_append <- 
  postmastr::pm_append(
  type   = "city",
  input  = input.x,
  output = output.x
)

citydict <-
  postmastr::pm_dictionary( type = "city",
                            filter = states,
                            append = city_append )

pm_city_none( postmastr_state_parsed, dictionary = citydict )
# A tibble: 5 × 3
  pm.uid pm.address             pm.state
   <int> <chr>                  <chr>   
1    108 "Sonoma cA"            <NA>    
2    127 "Falls ChurchVA 22042" <NA>    
3    146 "AHMEDABAD"            <NA>    
4    265 "Tay Ho Ha Noi"        <NA>    
5    565 ""                     <NA>    

The unmatched cities in our data set are not from the US and will be ignored.

postmastr_citystate_parsed <- 
  postmastr::pm_city_parse( postmastr_state_parsed, dictionary = citydict )

citystate_parsed <- postmastr_citystate_parsed %>% 
  tidylog::left_join( postmastr_citystate_id ) %>% 
  dplyr::select( row.id, pm.city, pm.state )

12.1.3 Street Names

12.1.3.1 Prepare Data

The final column to process is Addr_StreetLocation which contains street names.

survey_addr_street <- survey_df %>% 
  dplyr::select(row.id, Addr_StreetLocation) %>% 
  dplyr::filter(! Addr_StreetLocation %in% c("-99", NA))

postmastr_street_id <- 
  postmastr::pm_identify( survey_addr_street, 
                          var = "Addr_StreetLocation" )
postmastr_street_prep <- 
  postmastr::pm_prep( postmastr_street_id, 
                      var = "Addr_StreetLocation", 
                      type = "street" )

head( postmastr_street_prep, 10 )
# A tibble: 10 × 2
   pm.uid pm.address                     
    <int> <chr>                          
 1      1 305 WINTHROP CENTER ROAD       
 2      2 188 CASE RD                    
 3      3 15 Saunders Way #500D          
 4      4 819 Georgiana St.              
 5      5 P.O. Box 761                   
 6      6 712 S. River St. STE H         
 7      7 15 Grumman Road west Suite 1000
 8      8 1000 Dean Street Suite 420     
 9      9 10 Davids Drive                
10     10 144 Longview Circle            

12.1.3.2 Parse House Numbers

Next, we extract house/unit numbers from the street addresses.

postmastr_house_parsed <- postmastr::pm_house_parse( postmastr_street_prep )
head( postmastr_house_parsed, 10 )
# A tibble: 10 × 3
   pm.uid pm.address                   pm.house
    <int> <chr>                        <chr>   
 1      1 WINTHROP CENTER ROAD         305     
 2      2 CASE RD                      188     
 3      3 Saunders Way #500D           15      
 4      4 Georgiana St.                819     
 5      5 P.O. Box 761                 <NA>    
 6      6 S. River St. STE H           712     
 7      7 Grumman Road west Suite 1000 15      
 8      8 Dean Street Suite 420        1000    
 9      9 Davids Drive                 10      
10     10 Longview Circle              144     

12.1.3.3 Parse Street Prefix and Suffix

To parse street prefixes and suffixes requires 2 dictionaries. 1 with directions and 1 with conversions for suffixes like “street” or “boulevard”.

dirs <- pm_dictionary( type = "directional", locale = "us" )

postmastr_streetdir_parsed <- 
  postmastr::pm_streetDir_parse( postmastr_house_parsed, dictionary = dirs )

postmastr_streetSuf_parsed <- 
  postmastr::pm_streetSuf_parse( postmastr_streetdir_parsed )

12.1.3.4 Parse Street Names

The final element for parsing is the street names themselves.

postmaster_street_parsed <- 
  postmastr::pm_street_parse( postmastr_streetSuf_parsed,
                              ordinal = TRUE,
                              drop = TRUE)
postmaster_street_parsed
# A tibble: 665 × 6
   pm.uid pm.house pm.preDir pm.street                    pm.streetSuf pm.sufDir
    <int> <chr>    <chr>     <chr>                        <chr>        <chr>    
 1      1 305      <NA>      Winthrop Center              Rd           <NA>     
 2      2 188      <NA>      Case                         Rd           <NA>     
 3      3 15       <NA>      Saunders Way #500d           <NA>         <NA>     
 4      4 819      <NA>      Georgiana St                 <NA>         <NA>     
 5      5 <NA>     <NA>      Po. Box 761                  <NA>         <NA>     
 6      6 712      S         River St Ste H               <NA>         <NA>     
 7      7 15       <NA>      Grumman Road West Suite 1000 <NA>         <NA>     
 8      8 1000     <NA>      Dean Street Suite 420        <NA>         <NA>     
 9      9 10       <NA>      Davids                       Dr           <NA>     
10     10 144      <NA>      Longview                     Cir          <NA>     
# ℹ 655 more rows

12.1.3.5 Combine Street Components Into Single Column

After parsing each component, we concatenate them into a single street address.

postmastr_street_full_parsed <- 
  postmastr::pm_replace( postmaster_street_parsed, 
                         source = postmastr_street_id )

endQ <- rlang::quo( !! rlang::sym("pm.sufDir")
                    )

postmastr_street_full_parsed <- 
  postmastr::pm_rebuild( postmastr_street_full_parsed, 
                         output = "short", 
                         keep_ids = TRUE )

street_parsed <- postmastr_street_full_parsed %>% 
  tidylog::left_join(postmastr_street_id) %>% 
  dplyr::select(row.id, pm.address)

12.1.4 Combine All Parsed Addresses Components Together

And finally, we concatenate our postal code, city, state and street address into a single f_address column for geocoding.

postmastr_faddress_parsed <- street_parsed %>% 
  tidylog::left_join(citystate_parsed) %>% 
  tidylog::left_join(zip_parsed) %>% 
  tidyr::unite( f_address, pm.address, 
                pm.city, pm.state, 
                pm.zip, pm.zip4, 
                na.rm = TRUE, sep = ", " )

head(postmastr_faddress_parsed, 3) %>% knitr::kable()
row.id f_address
1 305 Winthrop Center Rd, WINTHROP, ME, 04364
2 188 Case Rd, Winthrop, 04364
3 15 Saunders Way #500d, Westbrook, ME, 04092

The f_address column now has parsed addresses that can be formatted using a geocoder of choice. We can now rejoin the parsed address data via row.id from our original data set.

survey_geoproc_df <- 
  survey_df %>% 
  tidylog::left_join( postmastr_faddress_parsed, by = "row.id" )