* Note: original file by Katherine, additional methods added by Helen 

/* This do-file constructs four alternative versions of the foreignness index of white male first names in the 1920 census. 
Original : finds relative frequency of first names of foreign-born vs. native-born separately for each birth cohort
NYSIIS : finds relative frequency of NYSIIS first names of foreign-born vs. native-born separately for each birth cohort
"dumb" : finds relative frequency of first names of foreign-born vs. native-born for all men born between 1895-1905 - used to replicate method used in modern data
Second-gen/mbpl : finds relative frequency of first names of second gen (foreign mother) vs. native (native mother) for each birth cohort
*/
* Start from an empty session and turn off output paging so the do-file runs unattended
clear all
set more off
* Folder containing the name-standardization do-files (stringmod_first_full.do is run from here further down)
global MatchingDoFiles /disk/homedirs/nber/keriksso/MatchingFiles
* Output folder; not referenced anywhere else in the visible part of this file
global outdir /disk/bulkw/keriksso/JMPMatch
* Folder with the cleaned per-state 1920 census extracts (files named <state code>_clean_20)
global datadir /homes/data/cens1930.work/keriksso/keriksso/Clean_residence
* Working folder for every intermediate and final file written by this do-file
* (the commented-out line is the earlier location)
*global Kbulk  /disk/bulkw/keriksso
global Kbulk "/disk/bulkw/hkissel/cens1930.work/Cultural_Assim_data"

** we keep all states outside of south: 
/* Since we use only non-southern states in analysis, we only calculate F-index using those who reside outside the south: 
   ALL states: 1 2 4 5 6  8 9 10 11 12 13 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 44 45 46 47 48 49 50 51 53 54 55 56
   Southern states: 1 (Alabama),5 (Arkansas), 10 (Delaware), 11 (DC), 12 (Florida), 13 (Georgia), 21 (Kentucky),22 (Louisiana), 24 (Maryland),28 (Mississippi), 37 (North Carolina),40 (Oklahoma), 45 (South Carolina),   47 (Tennessee),    48 (Texas), 51 (Virginia), 54 (West Virginia)
   States to use in analysis (drop south & Alaska & Hawaii): 4 6 8 9 16 17 18 19 20 23 25 26 27 29 30 31 32 33 34 35 36 38 39 41 42 44 46 49 50 53 55 56
   (54, West Virginia, is in the Southern list above and is therefore not part of the analysis list)
 */

* State codes looped over in the sections below; must match the analysis list in the comment above
local states "4 6 8 9 16 17 18 19 20 23 25 26 27 29 30 31 32 33 34 35 36 38 39 41 42 44 46 49 50 53 55 56"

*** SET GENDER YOU ARE USING HERE: *** 
** if boys, local gender = "", if girls local gender = "_girls"
** Both assignments below are commented out. With neither active, the local gender is undefined
** and expands to the empty string, so the boys branch (sex==1) and un-suffixed file names are used.
** Uncomment exactly one of them to choose explicitly.
** local gender _girls
** local gender ""


* Echo the active gender suffix to the log (blank when running boys)
di "******** `gender' only **** "
/* -------------------------------------------------------------------------------------------
* Setup: count every first name x age x birthplace cell in the 1920 census, state by state,
*        then stack the per-state tables into two national count files
*        (one keyed on own birthplace bpl, one keyed on mother birthplace mbpl).
------------------------------------------------------------------------------------------- */
	foreach st of local states {
		use $datadir//`st'_clean_20

		* Keep one sex only: 1 when gender is empty, 2 when gender is _girls
		if "`gender'" == "" {
			keep if sex==1
		}
		else if "`gender'" == "_girls" {
			keep if sex==2
		}
		* Log check of the remaining sex codes, then keep race code 100 only
		tab sex
		keep if race==100

		* First name = first word of namefrst in upper case;
		* when the first word is a lone question mark, use the second word instead
		split namefrst
		replace namefrst = upper(namefrst1)
		replace namefrst = upper(namefrst2) if namefrst1 == "?"
		* Best-effort removal of the split pieces (contract below discards leftovers anyway)
		capture drop namefrst1 namefrst2 namefrst3 namefrst4 namefrst5
		gen firstname = namefrst

		* Count table 1: name-age-bpl cells. contract replaces the data in memory,
		* so it runs on a preserved copy that is restored afterwards
		preserve
			contract firstname age bpl
			save $Kbulk//`st'_names`gender', replace
		restore

		* Count table 2: name-age-mbpl cells
		contract firstname age mbpl
		save $Kbulk//`st'_names_mbpl`gender', replace
	}

	* Stack the per-state bpl count tables into one national file
	cap clear
	foreach st of local states {
		append using $Kbulk//`st'_names`gender'
	}
	save $Kbulk//namescount1920`gender'.dta, replace

	* Same stacking for the per-state mbpl count tables
	clear
	foreach st of local states {
		append using $Kbulk//`st'_names_mbpl`gender'
	}
	save $Kbulk//namescount1920_mbpl`gender'.dta, replace

	clear


	
/* -------------------------------------------------------------------------------------------
* Cohort-specific ("original") FB index - by name, age, bpl
*
* For each birth year Y in 1850-1920 the index of a name is
*     pFB / (pUS + pFB)
* where pFB (pUS) is the share of foreign-born (native-born) people born in Y-20..Y who carry
* the name. Each name-age row keeps the index evaluated at its own birth year.
------------------------------------------------------------------------------------------- */
use $Kbulk//namescount1920`gender'.dta

* One row per first name x age x birthplace, cell count stored in freq
collapse (sum) _freq, by(firstname  age bpl)
rename _freq freq

* Foreign-born flag: birthplace codes above 10000 are treated as foreign.
* NOTE(review): a missing bpl also satisfies bpl>10000 in Stata and would be flagged
* foreign-born - confirm bpl is never missing in the count file.
gen fb = bpl>10000

* Aggregate the individual birthplaces into the two groups
collapse (sum) freq, by(firstname fb age)
sort firstname age fb

* Drop rare names: fewer than 100 people in total across all ages and both groups
bys firstname: egen ttf = total(freq)
sum ttf, d
drop if ttf<100

* Put native (fb=0) and foreign (fb=1) counts side by side on each name-age row
reshape wide freq, i(firstname age) j(fb)
rename freq0 freqUS
rename freq1 freqFB

gen birthyear = 1920 - age

* Fill the index one cohort year at a time; only rows born in that year receive the value
gen FBindex=.
forvalues yr = 1850/1920 {

	* counts that fall inside the 21 birth years ending in the current cohort year
	gen winUS = freqUS if inrange(birthyear, `yr'-20, `yr')
	gen winFB = freqFB if inrange(birthyear, `yr'-20, `yr')

	* per-name totals inside the window
	bys firstname: egen nameUS = total(winUS)
	bys firstname: egen nameFB = total(winFB)

	* group totals inside the window, over all names
	egen allFB = total(winFB)
	egen allUS = total(winUS)

	* share of each group carrying the name, then the index for this cohort
	gen shareUS = nameUS/allUS
	gen shareFB = nameFB/allFB
	replace FBindex = shareFB/(shareUS + shareFB) if birthyear==`yr'

	drop winUS winFB nameUS nameFB allUS allFB shareUS shareFB
}

* Rows born before 1850 never received an index value; drop them
keep if birthyear>=1850

save $Kbulk/US1920_FBIndex`gender', replace
clear


	
	
/* -------------------------------------------------------------------------------------------
* "Dumb" FB index (to match modern data) - by name and bpl for all men in cohort: 1895-1905
*
* A single pooled index per name: pFB / (pUS + pFB), where pFB (pUS) is the share of
* foreign-born (native-born) people born 1895-1905 who carry the name.
------------------------------------------------------------------------------------------- */

use $Kbulk//namescount1920`gender'.dta

* Restrict to birth years 1895-1905
gen birthyear = 1920 - age
keep if inrange(birthyear, 1895, 1905)

* Pool ages: one row per first name x birthplace
collapse (sum) _freq, by(firstname bpl)
rename _freq freq

* Foreign-born flag: birthplace codes above 10000 are treated as foreign.
* NOTE(review): a missing bpl also satisfies bpl>10000 and would be flagged foreign-born - confirm.
gen fb = bpl>10000

* Aggregate the individual birthplaces into the two groups
collapse (sum) freq, by(firstname fb)
sort firstname fb

* Drop names with fewer than 100 people in the pooled cohort
bys firstname: egen ttf = total(freq)
sum ttf, d
drop if ttf<100

* One row per name with native and foreign counts side by side
reshape wide freq, i(firstname) j(fb)
rename freq0 freqUS
rename freq1 freqFB

* Working copies of the counts; a missing count contributes zero to the totals below
gen cntUS = freqUS
gen cntFB = freqFB

* Per-name totals and overall group totals
bys firstname: egen nameUS = total(cntUS)
bys firstname: egen nameFB = total(cntFB)
egen allFB = total(cntFB)
egen allUS = total(cntUS)

* Within-group name shares (kept in the saved file) and the index
gen pUS = nameUS/allUS
gen pFB = nameFB/allFB
gen FBindex = pFB/(pUS + pFB)

drop cntUS cntFB nameUS nameFB allUS allFB

save $Kbulk/US1920_FBIndex_dumb`gender', replace
clear
	
	
/* -------------------------------------------------------------------------------------------
* Country-specific F index ("dumb" index, but separately by each of the 16 countries of origin)
* instead of calculating #foreign/ #native, find #german/#not-german...
*
* For each code b in the list, FBindex_b = p_b / (p_not_b + p_b), where p_b is the share of
* group b carrying the name and p_not_b the share of everyone else carrying it.
------------------------------------------------------------------------------------------- */
local bpls "404 405 410 414 434 450 453 465 426 421 400 412 411 401 436 420"

use $Kbulk//namescount1920`gender'.dta, clear

* Restrict to birth years 1895-1905
gen birthyear = 1920 - age
keep if inrange(birthyear, 1895, 1905)

* Pool ages: one row per first name x birthplace
collapse (sum) _freq, by(firstname bpl)
rename _freq freq

* Assign origin groups. Only a bpl exactly equal to code*100 joins that group;
* every other birthplace value stays in the residual group 0.
* NOTE(review): if bpl carries finer sub-codes within a country they end up in group 0 - confirm intended.
gen group = 0
foreach ctry of local bpls {
	replace group =  `ctry' if bpl == `ctry'*100
}

* Aggregate the individual birthplaces into the groups
collapse (sum) freq, by(firstname group)

* Keep only names with at least 100 people in the pooled cohort
bys firstname: egen ttf = total(freq)
sum ttf, d
drop if ttf<100

* One row per name, one count column per group
reshape wide freq, i(firstname) j(group)

* A name absent from a group has a missing count after the reshape; treat it as zero.
* The same pass builds the expression freq0 + freq404 + ... used for the name total.
local allgroups "freq0"
foreach g in 0 `bpls' {
	replace freq`g' = 0 if missing(freq`g')
	if "`g'" != "0" {
		local allgroups "`allgroups' + freq`g'"
	}
}

* Total number of people carrying the name, over all groups
gen freqtotal = `allgroups'

foreach ctry of local bpls {

	* everyone carrying the name who is not in this group
	gen freq_not`ctry' = freqtotal - freq`ctry'

	* per-name totals for the complement and for the group
	bys firstname: egen fnot`ctry' = total(freq_not`ctry')
	bys firstname: egen f`ctry' = total(freq`ctry')

	* overall totals for the group and for the complement
	egen n`ctry' = total(freq`ctry')
	egen nnot`ctry' = total(freq_not`ctry')

	* name shares inside and outside the group, then the index
	gen p`ctry' = f`ctry'/n`ctry'
	gen pnot`ctry' = fnot`ctry'/nnot`ctry'

	gen FBindex_`ctry' = p`ctry'/(pnot`ctry' + p`ctry')
}

* All intermediate columns are kept in the saved file
save $Kbulk/US1920_FBIndex_dumb_bybpl`gender', replace




/* -------------------------------------------------------------------------------------------
* Adjust names with NYSIIS
*
* Pooled 1895-1905 index as in the "dumb" section, but names are grouped by their NYSIIS code
* (nysiis_first) instead of the raw first name.
------------------------------------------------------------------------------------------- */
* The count file with NYSIIS codes attached is expected to exist already. The steps that
* created it, and the name-to-NYSIIS mapping file, are kept below for reference, commented out:
*use $Kbulk//namescount1920`gender'.dta, clear

*nysiis(firstname), gen(nysiis_first)
*save $Kbulk//namescount1920`gender'_withnysiis.dta, replace

*save mapping between nysiis & firstnames (so I don't have to re-find all the nysiis names...) 

*use $Kbulk//namescount1920`gender'_withnysiis.dta, clear
* duplicates drop nysiis_first firstname, force
 *keep nysiis_first firstname
 *save $Kbulk//nysiis_names`gender', replace

use $Kbulk//namescount1920`gender'_withnysiis.dta, clear

* Restrict to birth years 1895-1905
gen birthyear = 1920 - age
keep if inrange(birthyear, 1895, 1905)

* Pool ages: one row per NYSIIS name x birthplace
collapse (sum) _freq, by(nysiis_first bpl)
rename _freq freq

* Foreign-born flag: birthplace codes above 10000 are treated as foreign.
* NOTE(review): a missing bpl also satisfies bpl>10000 and would be flagged foreign-born - confirm.
gen fb = bpl>10000

* Aggregate the individual birthplaces into the two groups
collapse (sum) freq, by(nysiis_first fb)
sort nysiis_first fb

* Drop NYSIIS names with fewer than 100 people in the pooled cohort
bys nysiis_first: egen ttf = total(freq)
sum ttf, d
drop if ttf<100

* One row per NYSIIS name with native and foreign counts side by side
reshape wide freq, i(nysiis_first) j(fb)
rename freq0 freqUS
rename freq1 freqFB

* Working copies of the counts; a missing count contributes zero to the totals below
gen cntUS = freqUS
gen cntFB = freqFB

* Per-name totals and overall group totals
bys nysiis_first: egen nameUS = total(cntUS)
bys nysiis_first: egen nameFB = total(cntFB)
egen allFB = total(cntFB)
egen allUS = total(cntUS)

* Within-group name shares (kept in the saved file) and the index under its final name
gen pUS = nameUS/allUS
gen pFB = nameFB/allFB
gen FB_dumb_nysiis = pFB/(pUS + pFB)

drop cntUS cntFB nameUS nameFB allUS allFB

save $Kbulk/US1920_FBIndex_nysiis_dumb`gender', replace


clear
	

/* -------------------------------------------------------------------------------------------
* By state: the pooled 1895-1905 ("dumb") index computed separately within each state file
* NOTE(review): this section was originally headed "By state of birth", but the loop runs over
* the per-state count files and tags each row with the code of the file it came from -
* confirm which label is intended.
------------------------------------------------------------------------------------------- */
 local states "4 6 8 9 16 17 18 19 20 23 25 26 27 29 30 31 32 33 34 35 36 38 39 41 42 44 46 49 50 53 55 56"
foreach st of local states {
	use $Kbulk//`st'_names`gender', clear

	* Restrict to birth years 1895-1905
	gen birthyear = 1920 - age
	keep if inrange(birthyear, 1895, 1905)

	* Pool ages: one row per first name x birthplace
	collapse (sum) _freq, by(firstname bpl)
	rename _freq freq

	* Foreign-born flag: birthplace codes above 10000 are treated as foreign
	gen fb = bpl>10000

	* Aggregate the individual birthplaces into the two groups
	collapse (sum) freq, by(firstname fb)
	sort firstname fb

	* Drop names with fewer than 100 people within this state file
	bys firstname: egen ttf = total(freq)
	sum ttf, d
	drop if ttf<100

	* One row per name with native and foreign counts side by side
	reshape wide freq, i(firstname) j(fb)
	rename freq0 freqUS
	rename freq1 freqFB

	* Working copies of the counts; a missing count contributes zero to the totals below
	gen cntUS = freqUS
	gen cntFB = freqFB

	* Per-name totals and overall group totals within the state file
	bys firstname: egen nameUS = total(cntUS)
	bys firstname: egen nameFB = total(cntFB)
	egen allFB = total(cntFB)
	egen allUS = total(cntUS)

	* Within-group name shares (kept in the saved file) and the index
	gen pUS = nameUS/allUS
	gen pFB = nameFB/allFB
	gen FBindex = pFB/(pUS + pFB)

	drop cntUS cntFB nameUS nameFB allUS allFB

	* Tag rows with the state code and park the result in a temporary file
	gen statefip = `st'

	tempfile part`st'
	save `part`st''
}

* Stack the per-state results into one file
clear
foreach st of local states {
	append using `part`st''
}

save $Kbulk/US1920_FBIndex_dumb_bystate`gender', replace


/* -------------------------------------------------------------------------------------------
* Second-gen FB index - by name, age, mbpl
*
* Same rolling-cohort construction as the "original" index, except that the foreign/native
* split is based on the mother birthplace (mbpl) rather than own birthplace.
------------------------------------------------------------------------------------------- */
use $Kbulk//namescount1920_mbpl`gender'.dta, clear

* Remove rows with mbpl code 99999 before classifying
* NOTE(review): presumably the unknown/missing code - confirm against the codebook
drop if mbpl == 99999

* One row per first name x age x mother birthplace, cell count stored in freq
collapse (sum) _freq, by(firstname  age mbpl)
rename _freq freq

* Foreign flag from the mother birthplace: codes above 10000 are treated as foreign.
* NOTE(review): a missing mbpl also satisfies mbpl>10000 and would be flagged foreign - confirm.
gen fb = mbpl>10000

* Aggregate the individual birthplaces into the two groups
collapse (sum) freq, by(firstname fb age)
sort firstname age fb

* Drop rare names: fewer than 100 people in total across all ages and both groups
bys firstname: egen ttf = total(freq)
sum ttf, d
drop if ttf<100

* Put native (fb=0) and foreign (fb=1) counts side by side on each name-age row
reshape wide freq, i(firstname age) j(fb)
rename freq0 freqUS
rename freq1 freqFB

gen birthyear = 1920 - age

* Fill the index one cohort year at a time; only rows born in that year receive the value
gen FBindex=.
forvalues yr = 1850/1920 {

	* counts that fall inside the 21 birth years ending in the current cohort year
	gen winUS = freqUS if inrange(birthyear, `yr'-20, `yr')
	gen winFB = freqFB if inrange(birthyear, `yr'-20, `yr')

	* per-name totals inside the window
	bys firstname: egen nameUS = total(winUS)
	bys firstname: egen nameFB = total(winFB)

	* group totals inside the window, over all names
	egen allFB = total(winFB)
	egen allUS = total(winUS)

	* share of each group carrying the name, then the index for this cohort
	gen shareUS = nameUS/allUS
	gen shareFB = nameFB/allFB
	replace FBindex = shareFB/(shareUS + shareFB) if birthyear==`yr'

	drop winUS winFB nameUS nameFB allUS allFB shareUS shareFB
}

* Rows born before 1850 never received an index value; drop them
keep if birthyear>=1850

* Blank first names are removed from this file before saving
drop if firstname == ""
save $Kbulk/US1920_FBIndex_mbpl`gender', replace
clear



/* -------------------------------------------------------------------------------------------
* NYSIIS FB index - by nysiis name, age, bpl
*
* Rebuilds the rolling-cohort index after grouping first names by their NYSIIS code.
* Input : the saved cohort-specific index file (name-age rows with freqUS and freqFB).
* Output: one row per NYSIIS name x age with FBindex_N.
*
* Fix: both file names now carry the gender suffix like every other section. Previously a
* girls run read the boys index file and overwrote the un-suffixed output. With the suffix
* empty (boys) the file names are unchanged.
*
* NOTE(review): the input file was already restricted to birthyear>=1850 and to names with at
* least 100 people before it was saved, so the 21-year windows of the earliest cohorts only
* see births from 1850 onward, and rare spellings do not contribute to their NYSIIS group.
* Confirm this is intended.
------------------------------------------------------------------------------------------- */

use $Kbulk/US1920_FBIndex`gender', clear

* The external do-file is expected to read string_to_mod and create nysiis_out
* NOTE(review): contract inferred from the variable names used here - confirm in stringmod_first_full.do
gen string_to_mod = firstname
qui do "$MatchingDoFiles/stringmod_first_full.do"
rename nysiis_out nysiis_first

* Add up native and foreign counts over all spellings that share a NYSIIS code
collapse (sum) freqUS freqFB, by(nysiis_first age)

gen  birthyear = 1920 - age 

* Fill the index one cohort year at a time; only rows born in that year receive the value
gen FBindex_N=.
forvalues yr = 1850/1920 {

	* counts that fall inside the 21 birth years ending in the current cohort year
	gen winUS = freqUS if inrange(birthyear, `yr'-20, `yr')
	gen winFB = freqFB if inrange(birthyear, `yr'-20, `yr')

	* per-name totals inside the window
	bys nysiis_first: egen nameUS = total(winUS)
	bys nysiis_first: egen nameFB = total(winFB)

	* group totals inside the window, over all names
	egen allFB = total(winFB)
	egen allUS = total(winUS)

	* share of each group carrying the name, then the index for this cohort
	gen shareUS = nameUS/allUS
	gen shareFB = nameFB/allFB
	replace FBindex_N = shareFB/(shareUS + shareFB) if birthyear==`yr'

	drop winUS winFB nameUS nameFB allUS allFB shareUS shareFB
}

keep if birthyear>=1850

save $Kbulk/US1920_FBIndex_N`gender', replace


	
	
/* -------------------------------------------------------------------------------------------
* "Dumb" FB index using ONLY PARENTS
*
* Recounts names among people with nchild > 0 and rebuilds the pooled 1895-1905 index.
* Unlike the sections above, this one loops over both gender suffixes itself.
------------------------------------------------------------------------------------------- */

foreach gender in "" _girls {

	local states "4 6 8 9 16 17 18 19 20 23 25 26 27 29 30 31 32 33 34 35 36 38 39 41 42 44 46 49 50 53 55 56"

	di "******** `gender' only **** "

	* ---- Step 1: per-state name counts, parents only ----
	foreach st of local states {
		use $datadir//`st'_clean_20

		* Keep parents only
		* NOTE(review): a missing nchild also satisfies nchild > 0 in Stata - confirm it is never missing
		keep if nchild > 0

		* Keep one sex only: 1 when gender is empty, 2 when gender is _girls
		if "`gender'" == "" {
			keep if sex==1
		}
		else if "`gender'" == "_girls" {
			keep if sex==2
		}
		* Log check of the remaining sex codes, then keep race code 100 only
		tab sex
		keep if race==100

		* First name = first word of namefrst in upper case;
		* when the first word is a lone question mark, use the second word instead
		split namefrst
		replace namefrst = upper(namefrst1)
		replace namefrst = upper(namefrst2) if namefrst1 == "?"
		capture drop namefrst1 namefrst2 namefrst3 namefrst4 namefrst5
		gen firstname = namefrst

		* Counts of each name-age-bpl cell
		contract firstname age bpl
		save $Kbulk//`st'_names_parentsonly`gender', replace
	}

	* ---- Step 2: stack the per-state count tables ----
	cap clear
	foreach st of local states {
		append using $Kbulk//`st'_names_parentsonly`gender'
	}
	save $Kbulk//namescount1920_parentsonly`gender'.dta, replace

	* ---- Step 3: pooled index on the parents-only counts ----
	use $Kbulk//namescount1920_parentsonly`gender'.dta

	* Restrict to birth years 1895-1905
	gen birthyear = 1920 - age
	keep if inrange(birthyear, 1895, 1905)

	* Pool ages: one row per first name x birthplace
	collapse (sum) _freq, by(firstname bpl)
	rename _freq freq

	* Foreign-born flag: birthplace codes above 10000 are treated as foreign
	gen fb = bpl>10000

	* Aggregate the individual birthplaces into the two groups
	collapse (sum) freq, by(firstname fb)
	sort firstname fb

	* Drop names with fewer than 100 people in the pooled cohort
	bys firstname: egen ttf = total(freq)
	sum ttf, d
	drop if ttf<100

	* One row per name with native and foreign counts side by side
	reshape wide freq, i(firstname) j(fb)
	rename freq0 freqUS
	rename freq1 freqFB

	* Working copies of the counts; a missing count contributes zero to the totals below
	gen cntUS = freqUS
	gen cntFB = freqFB

	* Per-name totals and overall group totals
	bys firstname: egen nameUS = total(cntUS)
	bys firstname: egen nameFB = total(cntFB)
	egen allFB = total(cntFB)
	egen allUS = total(cntUS)

	* Within-group name shares (kept in the saved file) and the index
	gen pUS = nameUS/allUS
	gen pFB = nameFB/allFB
	gen FBindex = pFB/(pUS + pFB)

	drop cntUS cntFB nameUS nameFB allUS allFB

	save $Kbulk/US1920_FBIndex_dumb_parentsonly`gender', replace
	clear

}
