///////////////////////////////////////////////////
// Census record linking, `year1' -> `year2'
//
// 1. (optional) Load the pipe-delimited extracts, one file per
//    census year and birth state, and save them as Stata datasets.
// 2. Build a county-level table of the share of recorded occupations
//    whose text contains "elec", for each census year.
// 3. Link individuals across the two census years on birthplace and
//    the soundex codes of first and last name, accepting birth-year
//    discrepancies of 0, then 1, then 2 years, keeping unique matches.
// 4. Stack the linked files of all birth states into one dataset.
//
// Relies on programs defined in the "personal programs" do-files that
// are run below (e.g. add_state_details).
///////////////////////////////////////////////////

set trace off

local year1 1920
local year2 1930
local years `year1' `year2'

// Check mac or windows: the file below is only looked up under a Mac path
capture confirm file "/Users/miguel/Documents/Research/Stata/Census of Population/Docs/Census Variable names.xls"
// _rc is non-zero if there is an error
if (~_rc) {
    local mac 1
    local the_folder "/Volumes/mabm3/CoP"
    local slash "/"
    global log_entries 0
    run "/Users/miguel/Documents/Research/Stata/personal programs.do"
    run "/Volumes/mabm3/CoP/Stata/personal programs (faculty drive).do"
    local reload 0
}
else {
    local mac 0
    local the_folder "C:\Users\mabm3\Documents\umnData"
    // Use "/" on Windows too: Stata accepts it on every platform, whereas a
    // "\" placed right before a macro reference (as in `slash'`year') is the
    // escape sequence that suppresses the macro expansion.
    local slash "/"
    local reload 1
    run "N:\mabm3\CoP\Stata\personal programs (faculty drive).do"
}

local temp_folder   "`the_folder'`slash'temp"
local stata_folder  "`the_folder'`slash'Stata"
local output_folder "`the_folder'`slash'linked"
local freq_file     "`stata_folder'`slash'freq_occ_county"

///////////////////////////////////////////////////
// Treat files from each year into a Stata dataset
///////////////////////////////////////////////////

// The Mac branch only processes a two-state subset
if (`mac') {
    global birthstates Alaska Arizona
}
else {
    global birthstates Alabama Alaska Arizona Arkansas California Colorado Connecticut Delaware District_of_Columbia Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota Mississippi Missouri Montana Nebraska Nevada New_Hampshire New_Jersey New_Mexico New_York North_Carolina North_Dakota Ohio Oklahoma Oregon other Pennsylvania Rhode_Island South_Carolina South_Dakota Tennessee Texas Utah Vermont Virginia Washington West_Virginia Wisconsin Wyoming
    // global birthstates Alabama Alaska Arizona
    // Oregon Pennsylvania Rhode_Island South_Carolina South_Dakota Tennessee Texas Utah Vermont Virginia Washington West_Virginia Wisconsin Wyoming other
}

///////////////////////////////////////////////////
// Load datasets and save as Stata datasets
///////////////////////////////////////////////////

if (`reload') {
    foreach birthstate in $birthstates {
        foreach year of local years {
            insheet using "`the_folder'`slash'`year'_extracted`slash'`year'_`birthstate'.txt", delimiter("|") clear

            // Rename the raw columns to common names; the raw names differ by year
            if (`year' == 1920) {
                rename self_empty_name_surname last_name
                rename self_empty_name_given first_name
                rename self_birth_place_empty birthplace
                rename self_residence_place_state state
                rename self_residence_place_county county
                rename self_residence_info_age age
                rename indexed_occupation occupation
                rename indexed_industry industry
            }
            else if (`year' == 1930) {
                rename self_empty_name_surname last_name
                rename self_empty_name_given first_name
                rename self_birth_place_empty birthplace
                rename general_stateabb state_code
                rename self_residence_place_county county
                rename self_residence_info_age age
                rename self_empty_info_occupation occupation
                rename general_industry industry
                rename general_employment employment
            }

            save "`stata_folder'`slash'`year'_`birthstate'", replace
        }
    }
}

///////////////////////////////////////////////////
// Iterate on birth states to avoid memory
// burden in Stata, then treat each state:
// frequency table by county of residence and occupation
///////////////////////////////////////////////////

foreach year of local years {
    // virgin == 1 until the first birth state of this year has been saved
    local virgin 1
    foreach birthstate in $birthstates {
        // "clear" (not "replace") is the option -use- takes to discard data in memory
        use "`stata_folder'`slash'`year'_`birthstate'", clear
        gen num = 1
        // NOTE(review): the 1930 files are only given state_code above, so
        // "state" here presumably resolves through variable abbreviation
        // for that year -- confirm.
        collapse (sum) num, by (occupation state county)
        gen year = `year'
        if (~`virgin') append using "`freq_file'`year'"
        local virgin 0
        save "`freq_file'`year'", replace
    }

    if (`year' == 1920) {
        // add_state_details comes from the personal-programs do-files;
        // presumably it creates the state_code used below -- confirm.
        add_state_details
        // Panama Canal, military, etc. are left out
        drop state
        save "`freq_file'`year'", replace
    }
}

// Collapse again, now that I have all birthstates
use "`freq_file'1920", clear
append using "`freq_file'1930"
collapse (sum) num, by(occupation state_code county year)

// split occupations by electricity
drop if mi(occupation)
gen occ_elec = regexm(lower(occupation), "elec")

// collapse again by electricity
collapse (sum) num, by (occ_elec state_code county year)

// Reshape by occupation: num0 = non-"elec" count, num1 = "elec" count
reshape wide num, i(state_code county year) j(occ_elec)
replace num1 = 0 if mi(num1)
replace num0 = 0 if mi(num0)
// Share (in percent) of counted occupations that mention "elec"
gen elec_adoption = 100 * num1 / (num1 + num0)
drop num0 num1
reshape wide elec_adoption, i(state_code county) j(year)
save "`stata_folder'`slash'elec_adoption", replace

///////////////////////////////////////////////////
// Iterate on birth states to avoid memory
// burden in Stata, then treat each state:
// linking
///////////////////////////////////////////////////

local num_years : word count `years'

foreach birthstate in $birthstates {
    local output_file "`output_folder'`slash'linked_`birthstate'"

    forvalues i = 1 / `num_years' {
        local year : word `i' of `years'
        // "clear" (not "replace") is the option -use- takes to discard data in memory
        use "`stata_folder'`slash'`year'_`birthstate'", clear

        // Extract first name, without middle initial
        replace first_name = regexr(first_name, " .*$", "")

        // Generate soundex
        gen first_name_soundex = soundex(first_name)
        gen last_name_soundex = soundex(last_name)

        // Treat age: convert to numeric if it was read as a string
        // (force turns non-numeric entries into missing)
        local age_type: type age
        if (regexm("`age_type'", "^str")) destring age, replace force
        gen birthyear = `year' - age
        drop age

        // Treat birthplace to lower case
        replace birthplace = lower(birthplace)
        if ("`birthstate'" == "other") {
            drop if birthplace == "united states" | birthplace == "" | birthplace == "??"
        }

        // Generate unique ID (within this year/birth-state file)
        capture drop pid
        capture drop general_unique_identifier
        gen id = _n

        // Sort
        sort birthplace first_name_soundex last_name_soundex birthyear

        // Discard multiples: keep one record per linking key and birth year
        duplicates drop birthplace first_name_soundex last_name_soundex birthyear, force

        // rename other variables so the two years can sit side by side
        // NOTE(review): county (and state / state_code) get no year suffix,
        // so after the merge below only the master file's value survives
        // for any such variable present in both files -- confirm intended.
        foreach v in id first_name last_name occupation industry birthyear {
            rename `v' `v'`year'
        }
        if (`year' == 1930) rename employment employment1930

        // Save
        local file`i' "`temp_folder'`slash'`year'_sorted"
        save "`file`i''", replace
    }

    ///////////////////////////////////////////////////
    // Merge!
    ///////////////////////////////////////////////////

    local id_vars birthplace first_name_soundex last_name_soundex

    ///////////////////////////////////////////////////
    // Method 2: data-intensive, easier to code
    ///////////////////////////////////////////////////

    use "`file1'", clear
    // NOTE(review): merge m:m pairs observations sequentially within each
    // key group; it does not form every master/using combination. If all
    // candidate pairs are wanted, -joinby- is the documented alternative
    // -- confirm before changing, as it alters the linked output.
    merge m:m `id_vars' using "`file2'"
    drop _merge

    // Age discordance
    gen age_disc = birthyear`year2' - birthyear`year1'
    save "`temp_folder'`slash'merge", replace

    // Increment tolerance
    foreach m in 0 1 2 {
        use "`temp_folder'`slash'merge", clear

        // Consider a match if the age falls below the threshold
        // (a missing age_disc never satisfies the inequality)
        gen matched = (abs(age_disc) <= `m')

        // Define whether a person in either year was matched
        bysort id`year1': egen matched_total1 = max(matched)
        bysort id`year2': egen matched_total2 = max(matched)

        // We will discard all multiple matches, going in either direction
        // and consider only unique matches
        // NOTE(review): after this replace, rows whose age_disc is outside
        // the tolerance are also flagged whenever either person is matched
        // in another row; they are then kept by "keep if matched" below and
        // count toward the duplicate tags -- confirm this is intended.
        replace matched = max(matched_total1, matched_total2)

        // Clean up
        drop matched_total*

        // Split the dataset by matched and non-matched
        save "`temp_folder'`slash'merge", replace
        keep if matched
        duplicates tag id`year1', gen(dup1)
        duplicates tag id`year2', gen(dup2)

        // Drop multiple matches
        drop if dup1 | dup2

        // Clean up
        drop dup1 dup2 matched

        // Save or append unique matches
        if (`m' == 0) {
            save "`output_file'", replace
        }
        else {
            append using "`output_file'"
            save "`output_file'", replace
        }

        // Recover unmatched people and increase the tolerance
        use "`temp_folder'`slash'merge", clear
        keep if ~matched
        drop matched
        // NOTE(review): these keep a single row per id, so any additional
        // candidate rows for that person are discarded before the next
        // tolerance level -- confirm this is intended.
        duplicates drop id`year1', force
        duplicates drop id`year2', force
        save "`temp_folder'`slash'merge", replace
    }
}

// Compile all the linked datasets together ...
local virgin 1
foreach birthstate in $birthstates {
    local source_file "`output_folder'`slash'linked_`birthstate'"
    if (`virgin') {
        use "`source_file'", clear
    }
    else {
        append using "`source_file'"
    }
    // Without this reset every iteration would reload with "clear" and the
    // saved file would contain only the last birth state.
    local virgin 0
}
save "`stata_folder'`slash'linked", replace

/*
    ///////////////////////////////////////////////////
    // Method 1: sparse, computational
    ///////////////////////////////////////////////////

    // First pass: exact on year, then +/- 1, then +/- 2
    use "`file1'", clear
    merge 1:1 `id_vars' using "`file2'"

    // Save temporarily
    save "`temp_folder'merge0", replace

    // Keep good matches for saving
    keep if _merge == 3
    save "`output_file'", replace

    // Keep unmatched ones in each year
    // Unmatched in master
    use "`temp_folder'merge0", clear
    keep if _merge == 1
    drop *`year2'
    save "`temp_folder'unmatched1", replace

    // Unmatched in using
    use "`temp_folder'merge0", clear
    keep if _merge == 2
    drop *`year1'
    save "`temp_folder'unmatched2", replace

    // Now merge with tolerance on birthyear
    foreach m in 1 2 {
        foreach side in 1 -1 {
            use "`temp_folder'unmatched1"
            replace birthyear = birthyear + `side' * `m'
            merge 1:1 `id_vars' using "temp_folder'unmatched2"

            save "`temp_folder'temp_matches", replace
        }
        else {
            append using "`output_file'"
            save "`output_file'", replace
        }

        // Deal with unmatched ones

    }

    // Second pass: one year margin on birthplace
*/