/************************************

	Code for Abramitzky, Boustan, Eriksson (2019)
		"Do Immigrants Assimilate More Slowly Today than in the Past"
		American Economic Review: Insights
		
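		This do-file reads in birth certificate data from California.
			The import loop covers year files 1980-2015; only 1989-2015 are appended,
			and 2011 is dropped before analysis (incomplete data in that year)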

*************************************/

* Session setup: turn off output paging, empty memory, raise limits, close any open log.
set more off
* data, Mata objects and matrices are cleared before the limits below are changed
clear
clear mata
clear matrix
set maxvar 10000
* NOTE(review): presumably intended as "set matsize 1000" - confirm this line runs on the target Stata version
set mata 1000
* report a return message after every command
set rmsg on
* capture: no error if no log is currently open
cap log close


******************************************************************************

* set directories
* main is the project root; it is empty here and must be filled in before running,
* because every other path below is built from it
global main ""
* raw yearly text files, stored in sub-folders such as 1980-1989
global data "$main/1data"
* infile dictionaries (.dct), one per group of years sharing a layout
global dict "$main/3work/Dictionaries"
* destination of the imported per-year .dta files
global proc_data "$main/3work/ImportedYears"


******************************************************************************


	* Import loop: read every yearly text file through the dictionary that matches its layout,
	* apply the year-specific cleaning, and write one .dta per year to the "ImportedYears" folder.
	* The yearly files are saved to disk (rather than held as tempfiles) because of processing
	* time and because the machine may not hold all of the tempfiles at once.
	forvalues year = 1980(1)2015{

		* raw text file: one sub-folder per block of years
		if inrange(`year', 1980, 1989) {
			local dat_name "$data/1980-1989/`year'.txt"
		}
		else if inrange(`year', 1990, 1999) {
			local dat_name "$data/1990-1999/`year'.txt"
		}
		else if inrange(`year', 2000, 2009) {
			local dat_name "$data/2000-2009/`year'.txt"
		}
		else {
			local dat_name "$data/2010-2015/`year'.txt"
		}

		* dictionary: several years share a layout; 2005-2013 each have their own file
		if `year'<=1981 {
			local dct_name "$dict/1980_1981.dct"
		}
		else if `year'<=1984 {
			local dct_name "$dict/1982_1984.dct"
		}
		else if `year'<=1988 {
			local dct_name "$dict/1985_1988.dct"
		}
		else if `year'<=1996 {
			local dct_name "$dict/1989_1996.dct"
		}
		else if `year'<=1999 {
			local dct_name "$dict/1997_1999.dct"
		}
		else if `year'<=2002 {
			local dct_name "$dict/2000_2002.dct"
		}
		else if `year'<=2004 {
			local dct_name "$dict/2003_2004.dct"
		}
		else if `year'<=2013 {
			local dct_name "$dict/`year'.dct"
		}
		else {
			local dct_name "$dict/2014_2015.dct"
		}

		di "Processing: `year' from `dat_name'"

		qui infile using "`dct_name'", using("`dat_name'") clear
		qui gen data_year = `year'

		* cleaning steps from 1make_CAcertificate_FULL.do
		gen birthyear=`year'

		* drop fields that are not used downstream; capture because not every layout has every field
		foreach v in lastname flastworked mlastworked feduc mmulti fmulti mbirthplace_detail flastname mresidence ID moccupation fage fethnicity foccupation deathdate birthsaliveliving birthsalivedeceased totalbornalive datelastlivebirth fhispcode mhispcode mresidencestate mzipcode middlename birthweight mhospcode birthplace frace frace_ethnicity race ethnicity{
			cap drop `v'
		}

		* 1989-2010: the parent birth date starts with a century flag ("9" -> 19xx, "0" -> 20xx)
		* followed by the two-digit year; rebuild it as a four-digit year plus the trailing four characters
		if `year'>=1989 & `year'<=2010{
			foreach p in m f {
				gen temp=`p'birthdate
				gen `p'birthyear=""
				replace `p'birthyear="19"+substr(temp, 2, 2) if substr(temp, 1, 1)=="9"
				replace `p'birthyear="20"+substr(temp, 2, 2) if substr(temp, 1, 1)=="0"
				replace temp=substr(temp, 4, 4)
				replace temp=`p'birthyear+temp
				destring `p'birthyear, replace
				replace `p'birthdate=temp
				drop temp
			}
		}

		* after 2010: the birth date already starts with the four-digit year
		if `year'>2010{
			foreach p in m f {
				gen `p'birthyear = substr(`p'birthdate,1,4)
				destring `p'birthyear, replace
			}
		}

		qui compress
		qui save "$proc_data/`year'.dta", replace

	}
	
	* Stack the imported yearly files for 1989-2015 into a single dataset in memory;
	* data_year duplicates birthyear and is dropped once the stack is built.
	clear
	forvalues yr = 1989(1)2015{
		di "Appending: `yr'"
		qui append using "$proc_data/`yr'.dta"
		qui compress
	}
	drop data_year

	
	
		*** now clean the appended data

		* NOTE: because of computational constraints, we perform the cleaning in multiple steps

		* 1) clean mrace
		* fill mrace_methnicity from mrace where only mrace is recorded, then rescale codes 1 and 2
			replace mrace_methnicity = mrace if mi(mrace_methnicity) & !mi(mrace)
			recode mrace (1=10) (2=20)

			drop birthorder fbirthdate

		* mnativity = 1 unless the mother's birthplace is one of the listed non-state codes or is blank
			gen mnativity = !inlist(mbirthplace, "PR", "VI", "GU", "CH", "JA", "VN", "PI", "CN", "CU") & !inlist(mbirthplace, "MX", "RE", "NA", "")

			compress

		* temp holds the numeric value of mbirthplace and is missing when the code is not numeric;
		* from 2009 on, any non-numeric birthplace is treated as not native
			destring mbirthplace, generate(temp) force
			replace mnativity=0 if mi(temp) & birthyear>=2009
			gen mbirthplace_detail=mbirthplace if birthyear>=2009

		* harmonize the 2009+ letter codes with the earlier coding.
		* Order matters: CH must become TW before CC is renamed to CH.
			local recent "birthyear>=2009"
			replace mbirthplace="VN" if mbirthplace=="VM" & `recent'
			replace mbirthplace="TW" if mbirthplace=="CH" & `recent'
			replace mbirthplace="CH" if mbirthplace=="CC" & `recent'
			replace mbirthplace="PI" if mbirthplace=="PH" & `recent'
			replace mbirthplace="NA" if mbirthplace=="XX" & `recent'

		* every remaining non-numeric, non-blank 2009+ code outside the named list becomes "RE"
			replace mbirthplace="RE" if !inlist(mbirthplace, "PR", "VI", "GU", "CH", "JA", "VN", "PI", "CN", "CU") & !inlist(mbirthplace, "MX", "NA", "") & `recent' & mi(temp)

			* Translate the three-digit state codes 101-151 into two-letter abbreviations.
			* The list is in code order, so the n-th abbreviation belongs to code 100+n.
			local st_abbrevs "AL AK AZ AR CA CO CT DE DC FL GA HI ID IL IN IA KS KY LA ME MD MA MI MN MS MO MT NB NV NH NJ NM NY NC ND OH OK OR PA RI SC SD TN TX UT VT VA WA WV WI WY"
			local st_code = 101
			foreach st of local st_abbrevs {
				replace mbirthplace="`st'" if mbirthplace=="`st_code'"
				local ++st_code
			}
			drop mbirthplace_detail
			

			* fill a missing mother birth year from the child's birth year and the mother's age
			replace mbirthyear=birthyear-mage if mbirthyear==.

			* recode education to be consistent across years
			* (2006 onward uses category codes 1-9; map them onto the earlier scale, 99 = missing)
			recode meduc (6 7 8 = 16) (9 = 99) (1 = 8) (3 = 12) (4 5 = 13) (2 = 9) if birthyear>=2006

			save "$main/3work/raw_data_appended.dta", replace
	
	
	
	
	
********************************
* Data cleaning steps

	* Child-name foreignness index, built separately for boys (sex==1 -> index_males.dta)
	* and girls (sex==2 -> index_females.dta).
	* For every name and birth year the index is pFB/(pUS + pFB), where pFB (pUS) is the share of
	* all births to foreign-born (native) mothers up to and including that year that carry the name.
	forvalues s = 1/2 {
		local tag : word `s' of males females

		use "$main/3work/raw_data_appended.dta", replace

		preserve
			drop if birthyear>=1985&birthyear<=1988
			drop if birthyear==2011
			keep if sex==`s'
			gen freq=1
			gen fb = 1-mnativity
			drop if fb==.
			drop if firstname==""

			* one row per name x birth year x mother's nativity
			collapse (sum) freq, by(firstname  birthyear fb)
			sort firstname birthyear fb

			* names with fewer than 100 births overall are not indexed
			bys firstname: egen ttf = sum(freq)
			sum ttf, d
			drop if ttf<100

			reshape wide freq, i(firstname birthyear) j(fb)
			rename freq0 freqUS
			rename freq1 freqFB

			gen FBindex=.
			forvalues yr = 1980/2015{
				* counts restricted to births up to and including this year
				gen xUS = freqUS if birthyear<=`yr'
				gen xFB = freqFB if birthyear<=`yr'

				* name-level totals and overall totals
				bys firstname: egen fUS = sum(xUS)
				bys firstname: egen fFB = sum(xFB)
				egen nFB = sum(xFB)
				egen nUS = sum(xUS)

				gen pUS = fUS/nUS
				gen pFB = fFB/nFB

				* only the rows for this birth year receive this year's cumulative index
				replace FBindex = pFB/(pUS + pFB) if birthyear==`yr'
				drop xUS xFB fUS fFB nUS nFB pUS pFB
			}

			save "$main/3work/index_`tag'.dta", replace
		restore
	}
		
		
		
	
		* Attach the sex-specific child-name index to the birth records and stack both sexes.
		* The data in memory at this point are the full appended birth records.
		* keep(1 3) retains every birth record, matched or not, and discards index-only rows.
		preserve
			keep if sex==1
			merge m:1 firstname birthyear using "$main/3work/index_males.dta", keep(1 3) nogen
			* meduc is spelled out in full so the keep list does not depend on variable abbreviation
			keep firstname sex birthyear totaleverborn mmaidenname mbirthyear mnativity mrace mbirthplace mbirthdate FBindex mage mfirstname meduc
			tempfile males
			save `males', replace
		restore

		preserve
			keep if sex==2
			merge m:1 firstname birthyear using "$main/3work/index_females.dta", keep(1 3) nogen
			keep firstname sex birthyear totaleverborn mmaidenname mbirthyear mnativity mrace mbirthplace mbirthdate FBindex mage mfirstname meduc
			tempfile females
			save `females', replace
		restore

		clear
		use `males', clear
		append using `females'

		* left-over index columns, if any survived the keep lists above
		cap drop freqUS freqFB ttf

		save "$main/3work/CAbirths_children_coded.dta", replace
		


	
*** Build the analysis sample from the coded children dataset (output: compiled.dta)

	use "$main/3work/CAbirths_children_coded.dta", replace

	destring mbirthyear, replace
	compress

	* 1) drop records lacking any of the fields that identify a mother
	drop if mmaidenname==""
	drop if inlist(mbirthplace, "NA", "")
	drop if mfirstname==""
	drop if mi(mbirthyear)
	drop if inlist(mbirthdate, ".", "")

	* incomplete data in this year
	drop if birthyear==2011

	* 2) mother fixed effects: one id per distinct combination of the identifying fields
	cap drop mgroup
	egen mgroup=group(mfirstname mmaidenname mbirthdate mbirthplace mbirthyear)

	* 3) create the education variable across all years BEFORE restricting to 20-40 yr old mothers
	replace meduc = -1 if meduc==99 // otherwise we pick up the missings as maximums
	bys mgroup: egen max_meduc = max(meduc)

	* 4) keep mothers aged 20-40; education above was computed over all of a mother's births
	keep if inrange(mage, 20, 40)

	* 5) if haven't dropped already - keep only 1989-2015 (used all years to calculate name foreignness?)
	drop if birthyear<1989

	save "$main/3work/compiled.dta", replace
	
	
	
	
	
	
	
	
/*********************************************************************



		INDEX BASED OFF OF PARENT'S NAMES (Indexalt)





************************************************************************/	
	

	
	
********************************
* Data cleaning steps

	* Parent-name index (Indexalt): a single index per name, pooled over all births, computed from
	* the first names of fathers (-> indexalt_males.dta, sex = 1) and of mothers
	* (-> indexalt_females.dta, sex = 2) who were themselves born 1985-1995.
	* In both cases foreign-born status comes from the mother's nativity.
	foreach p in f m {
		if "`p'"=="f" {
			local tag "males"
			local sexcode 1
		}
		else {
			local tag "females"
			local sexcode 2
		}

		use "$main/3work/raw_data_appended.dta", replace

		preserve
			drop if birthyear==2011
			keep if inrange(`p'birthyear, 1985, 1995)
			gen freq=1
			gen fb = 1-mnativity
			drop if fb==.
			drop if `p'firstname==""

			* the parent's name takes the place of the child's name from here on
			drop firstname
			rename `p'firstname firstname

			* combine birth cohorts into 1
			collapse (sum) freq, by(firstname fb)
			sort firstname fb

			* names with fewer than 100 occurrences are not indexed
			bys firstname: egen ttf = sum(freq)
			sum ttf, d
			drop if ttf<100

			reshape wide freq, i(firstname) j(fb)
			rename freq0 freqUS
			rename freq1 freqFB

			gen xUS = freqUS
			gen xFB = freqFB

			* name-level counts (a missing count is summed as zero) and overall totals
			bys firstname: egen fUS = sum(xUS)
			bys firstname: egen fFB = sum(xFB)
			egen nFB = sum(xFB)
			egen nUS = sum(xUS)

			gen pUS = fUS/nUS
			gen pFB = fFB/nFB

			gen FBindex = pFB/(pUS + pFB)
			drop xUS xFB fUS fFB nUS nFB
			drop pUS pFB

			gen sex = `sexcode'

			save "$main/3work/indexalt_`tag'.dta", replace
		restore
	}
			
	
	
	
	
		* Attach the parent-name (Indexalt) index to each child by the child's first name and sex,
		* then add the index value of the father's and the mother's own first name.
		* The data in memory at this point are the full appended birth records.
		preserve
			keep if sex==1
			merge m:1 firstname using "$main/3work/indexalt_males.dta", keep(1 3) nogen
			* meduc is spelled out in full so the keep list does not depend on variable abbreviation
			keep firstname sex birthyear totaleverborn mmaidenname mbirthyear mnativity mrace mbirthplace mbirthdate FBindex mage mfirstname meduc ffirstname
			tempfile males
			save `males', replace
		restore

		* parents FBindex: father-name lookup table
		preserve
			use "$main/3work/indexalt_males.dta", clear
			rename firstname ffirstname
			rename FBindex FBindex_fathers
			keep ffirstname FBindex_fathers
			tempfile fathers
			save `fathers', replace
		restore

		preserve
			keep if sex==2
			merge m:1 firstname using "$main/3work/indexalt_females.dta", keep(1 3) nogen
			keep firstname sex birthyear totaleverborn mmaidenname mbirthyear mnativity mrace mbirthplace mbirthdate FBindex mage mfirstname meduc ffirstname
			tempfile females
			save `females', replace
		restore

		* parents FBindex: mother-name lookup table
		preserve
			use "$main/3work/indexalt_females.dta", clear
			rename firstname mfirstname
			rename FBindex FBindex_mothers
			keep mfirstname FBindex_mothers
			tempfile mothers
			save `mothers', replace
		restore

		clear
		use `males', clear
		append using `females'

		* keep(1 3), as in the child-level merges: every birth record is retained and a name that
		* exists only in the lookup table cannot add a row with no birth record behind it
		merge m:1 ffirstname using `fathers', keep(1 3) nogen
		merge m:1 mfirstname using `mothers', keep(1 3) nogen

		* left-over index columns, if any survived the keep lists above
		cap drop freqUS freqFB ttf

		save "$main/3work/CAbirths_children_coded_indexalt.dta", replace
	
	
	
	* Build the analysis sample for the parent-name index (output: compiled_indexalt.dta)
	use "$main/3work/CAbirths_children_coded_indexalt.dta", replace

	destring mbirthyear, replace
	compress

	* 1) drop records lacking any of the fields that identify a mother
	drop if mmaidenname=="" | mfirstname==""
	drop if inlist(mbirthplace, "NA", "")
	drop if mi(mbirthyear)
	drop if inlist(mbirthdate, ".", "")

	* incomplete data in this year
	drop if birthyear==2011

	* 2) mother fixed effects: one id per distinct combination of the identifying fields
	cap drop mgroup
	egen mgroup=group(mfirstname mmaidenname mbirthdate mbirthplace mbirthyear)

	* 3) create the education variable across all years BEFORE restricting to 20-40 yr old mothers
	replace meduc = -1 if meduc==99 // otherwise we pick up the missings as maximums
	bys mgroup: egen max_meduc = max(meduc)

	* 4) keep mothers aged 20-40; education above was computed over all of a mother's births
	keep if inrange(mage, 20, 40)

	* 5) if haven't dropped already - keep only 1989-2015 (used all years to calculate name foreignness?)
	drop if birthyear<1989

	save "$main/3work/compiled_indexalt.dta", replace
	
	

	
	
	
*************************
* Robustness dataset, where we only keep mothers who had all of their births in CA
* (same steps as the parent-name analysis sample, plus the all-births-in-CA filter)

	use "$main/3work/CAbirths_children_coded_indexalt.dta", replace

	destring mbirthyear, replace
	compress

	* 1) drop records lacking any of the fields that identify a mother
	drop if mmaidenname==""
	drop if inlist(mbirthplace, "NA", "")
	drop if mfirstname==""
	drop if mi(mbirthyear)
	drop if inlist(mbirthdate, ".", "")

	* incomplete data in this year
	drop if birthyear==2011

	* 2) mother fixed effects: one id per distinct combination of the identifying fields
	cap drop mgroup
	egen mgroup=group(mfirstname mmaidenname mbirthdate mbirthplace mbirthyear)

	* 3) create the education variable across all years BEFORE restricting to 20-40 yr old mothers
	replace meduc = -1 if meduc==99 // otherwise we pick up the missings as maximums
	bys mgroup: egen max_meduc = max(meduc)

	* 3.5) number of records per mother, counted before the totaleverborn filter below
	bys mgroup: gen num_obs = _N

	* results are similar if we drop missing totaleverborn here (same as in the main results, or if we recode to 0)
	* 9.318 or 9.307 million observations
	drop if totaleverborn==99

	* topcode totaleverborn at its 99th percentile before it is used for the flag
	sum totaleverborn, d
	replace totaleverborn = `r(p99)' if totaleverborn>`r(p99)' & !mi(totaleverborn)

	* largest (topcoded) number of children ever born reported by each mother
	bys mgroup: egen max_births = max(totaleverborn)

	* all births in CA implies the number of times they appear in the dataset>=totaleverborn (b/c of topcode)
	gen all_in_ca = (num_obs>=max_births)
	keep if all_in_ca==1

	* 4) keep mothers aged 20-40; education above was computed over all of a mother's births
	keep if inrange(mage, 20, 40)

	* 5) if haven't dropped already - keep only 1989-2015 (used all years to calculate name foreignness?)
	drop if birthyear<1989

	save "$main/3work/compiled_all_in_ca_robust_indexalt.dta", replace
		
	
	

	
	
	
	
	
	
	
	
********************************************************************************************************************	
	
* Additional robustness measures of F-index	(Index ALT)
	
	
	
	
	
	
******************************
* Country-specific index

	local countrylist "MX CH PI JA VN CU CN RE"

	* One parent-name index per origin code: births to mothers from that origin (fb = 1) are
	* contrasted with births to native mothers (fb = 0); all other births are left out.
	* Father names use births with sex==1 (-> index_males_XX_indexalt.dta),
	* mother names use births with sex==2 (-> index_females_XX_indexalt.dta).
	foreach p in f m {
		if "`p'"=="f" {
			local tag "males"
			local sexcode 1
		}
		else {
			local tag "females"
			local sexcode 2
		}

		use "$main/3work/raw_data_appended.dta", replace

		foreach ctry in `countrylist'{
			preserve
				keep if inrange(`p'birthyear, 1985, 1995)
				drop if birthyear==2011
				keep if sex==`sexcode'
				gen freq=1
				gen fb = 1 if mbirthplace=="`ctry'"
				replace fb = 0 if mnativity==1
				replace freq = 0 if fb==.
				drop if fb==.

				* the parent's name takes the place of the child's name from here on
				drop firstname
				rename `p'firstname firstname
				drop if firstname==""

				collapse (sum) freq, by(firstname  fb)
				sort firstname  fb

				* names with fewer than 100 occurrences are not indexed
				bys firstname: egen ttf = sum(freq)
				sum ttf, d
				drop if ttf<100

				reshape wide freq, i(firstname ) j(fb)
				rename freq0 freqUS
				rename freq1 freqFB

				gen xUS = freqUS
				gen xFB = freqFB

				* name-level counts (a missing count is summed as zero) and overall totals
				bys firstname: egen fUS = sum(xUS)
				bys firstname: egen fFB = sum(xFB)
				egen nFB = sum(xFB)
				egen nUS = sum(xUS)

				gen pUS = fUS/nUS
				gen pFB = fFB/nFB

				gen FBindex = pFB/(pUS + pFB)
				drop xUS xFB fUS fFB nUS nFB
				drop pUS pFB

				save "$main/3work/index_`tag'_`ctry'_indexalt.dta", replace
			restore
		}
	}
		
		
			
	
	
	
******************************************
* NYSIIS

	* Parent-name index computed on names standardized by the external NYSIIS do-file.
	* Father names use births with sex==1 (-> index_males_nysiis_indexalt.dta),
	* mother names use births with sex==2 (-> index_females_nysiis_indexalt.dta).
	*
	* FIX: the cohort restriction previously read "drop if ...birthyear>=1985 & ...birthyear<=1995",
	* which kept every parent EXCEPT those born 1985-1995. Every other parent-name index in this
	* file keeps that cohort, so the restriction is now "keep if", consistent with them.
	foreach p in f m {
		if "`p'"=="f" {
			local tag "males"
			local sexcode 1
		}
		else {
			local tag "females"
			local sexcode 2
		}

		use "$main/3work/raw_data_appended.dta", clear

		preserve
			keep if `p'birthyear>=1985 & `p'birthyear<=1995
			drop if birthyear==2011
			keep if sex==`sexcode'

			* the parent's name takes the place of the child's name, then it is standardized
			drop firstname
			rename `p'firstname firstname
			do "$main/2progs/AltIndex_parallel_sample/z_nysiis_parents.do" // note, this is the same program as z_nysiis.do

			gen freq=1
			gen fb = 1-mnativity
			drop if fb==.
			drop if firstname==""

			collapse (sum) freq, by(firstname fb)
			sort firstname fb

			* names with fewer than 100 occurrences are not indexed
			bys firstname: egen ttf = sum(freq)
			sum ttf, d
			drop if ttf<100

			reshape wide freq, i(firstname) j(fb)
			rename freq0 freqUS
			rename freq1 freqFB

			gen xUS = freqUS
			gen xFB = freqFB

			* name-level counts (a missing count is summed as zero) and overall totals
			bys firstname: egen fUS = sum(xUS)
			bys firstname: egen fFB = sum(xFB)
			egen nFB = sum(xFB)
			egen nUS = sum(xUS)

			gen pUS = fUS/nUS
			gen pFB = fFB/nFB

			gen FBindex = pFB/(pUS + pFB)
			drop xUS xFB fUS fFB nUS nFB
			drop pUS pFB

			save "$main/3work/index_`tag'_nysiis_indexalt.dta", replace
		restore
	}
				
	
	
	
************************************
*** Assembly of index alt datasets

	* nysiis cleaning: standardize the child's first name with the external NYSIIS do-file, attach the
	* NYSIIS parent-name index by sex, then add the index value of each parent's own standardized name
	use "$main/3work/raw_data_appended.dta", clear

	preserve
		keep if sex==1
		do "$main/2progs/z_nysiis.do" // this is the same program as z_nysiis_parents.do
		merge m:1 firstname using "$main/3work/index_males_nysiis_indexalt.dta", keep(1 3) nogen
		* meduc is spelled out in full so the keep list does not depend on variable abbreviation
		keep firstname sex birthyear totaleverborn mmaidenname mbirthyear mnativity mrace mbirthplace mbirthdate FBindex mage mfirstname meduc ffirstname
		tempfile males
		save `males', replace
	restore

	preserve
		keep if sex==2
		do "$main/2progs/z_nysiis.do" // convert these names to nysiis
		merge m:1 firstname using "$main/3work/index_females_nysiis_indexalt.dta", keep(1 3) nogen
		keep firstname sex birthyear totaleverborn mmaidenname mbirthyear mnativity mrace mbirthplace mbirthdate FBindex mage mfirstname meduc ffirstname
		tempfile females
		save `females', replace
	restore

	* parents FBindex: father-name lookup table
	preserve
		use "$main/3work/index_males_nysiis_indexalt.dta", clear
		rename firstname ffirstname
		rename FBindex FBindex_fathers
		keep ffirstname FBindex_fathers
		tempfile fathers
		save `fathers', replace
	restore

	* parents FBindex: mother-name lookup table
	preserve
		use "$main/3work/index_females_nysiis_indexalt.dta", clear
		rename firstname mfirstname
		rename FBindex FBindex_mothers
		keep mfirstname FBindex_mothers
		tempfile mothers
		save `mothers', replace
	restore

	clear
	use `males', clear
	append using `females'

	* standardize the parents' names before the lookups.
	* FIX: the folder name is now spelled AltIndex_parallel_sample, exactly as where the
	* NYSIIS indexes are built; the previous spelling (Altindex) fails on case-sensitive file systems.
	do "$main/2progs/AltIndex_parallel_sample/z_nysiis_fathers.do"
	* helper variables left behind by the fathers program, removed before the mothers program runs
	cap drop var len nlen
	do "$main/2progs/AltIndex_parallel_sample/z_nysiis_mothers.do"

	* keep(1 3), as in the child-level merges: every birth record is retained and a name that
	* exists only in the lookup table cannot add a row with no birth record behind it
	merge m:1 ffirstname using `fathers', keep(1 3) nogen
	merge m:1 mfirstname using `mothers', keep(1 3) nogen

	* left-over index columns, if any survived the keep lists above
	cap drop freqUS freqFB ttf
	

	
	* Build the analysis sample from the assembled data in memory (output: compiled_nysiis_indexalt.dta)
	destring mbirthyear, replace
	compress

	* 1) drop records lacking any of the fields that identify a mother
	drop if mmaidenname=="" | mfirstname==""
	drop if inlist(mbirthplace, "NA", "")
	drop if mi(mbirthyear)
	drop if inlist(mbirthdate, ".", "")

	* incomplete data in this year
	drop if birthyear==2011

	* 2) mother fixed effects: one id per distinct combination of the identifying fields
	cap drop mgroup
	egen mgroup=group(mfirstname mmaidenname mbirthdate mbirthplace mbirthyear)

	* 3) create the education variable across all years BEFORE restricting to 20-40 yr old mothers
	replace meduc = -1 if meduc==99 // otherwise we pick up the missings as maximums
	bys mgroup: egen max_meduc = max(meduc)

	* 4) keep mothers aged 20-40; education above was computed over all of a mother's births
	keep if inrange(mage, 20, 40)

	* 5) if haven't dropped already - keep only 1989-2015 (used all years to calculate name foreignness?)
	drop if birthyear<1989

	save "$main/3work/compiled_nysiis_indexalt.dta", replace
	
	
	
	
	
	
	
* Update with country specific
* For every origin code in the list, births to mothers from that origin get the index built for
* that origin; they then replace the same births in the pooled parent-name dataset.

	local countrylist "MX CH PI JA VN CU CN RE"

	foreach var in `countrylist'{
		use "$main/3work/raw_data_appended.dta", clear
			keep if mbirthplace=="`var'"

			preserve
				keep if sex==1
				merge m:1 firstname using "$main/3work/index_males_`var'_indexalt.dta", keep(1 3) nogen
				* meduc is spelled out in full so the keep list does not depend on variable abbreviation
				keep firstname sex birthyear totaleverborn mmaidenname mbirthyear mnativity mrace mbirthplace mbirthdate FBindex mage mfirstname meduc
				tempfile males
				save `males', replace
			restore

			preserve
				keep if sex==2
				merge m:1 firstname using "$main/3work/index_females_`var'_indexalt.dta", keep(1 3) nogen
				keep firstname sex birthyear totaleverborn mmaidenname mbirthyear mnativity mrace mbirthplace mbirthdate FBindex mage mfirstname meduc
				tempfile females
				save `females', replace
			restore

		clear
		use `males', clear
		append using `females'

		* one tempfile per origin code, appended to the pooled data below
		tempfile appended_`var'
		save `appended_`var''
	}

	* start from the pooled dataset with the listed origins removed ...
	use "$main/3work/CAbirths_children_coded_indexalt.dta", clear
	foreach var in `countrylist'{
		drop if mbirthplace=="`var'"
	}

	* ... and put those births back with their origin-specific index
	foreach var in `countrylist'{
		append using `appended_`var''
	}
	
	
	
	
	* Build the analysis sample from the assembled data in memory
	* (output: compiled_country_specific_indexalt.dta)
	destring mbirthyear, replace
	compress

	* 1) drop records lacking any of the fields that identify a mother
	drop if mmaidenname==""
	drop if inlist(mbirthplace, "NA", "")
	drop if mfirstname==""	// these will be dropped later anyways since there is no FBindex for these observations
	drop if mi(mbirthyear)
	drop if inlist(mbirthdate, ".", "")

	* incomplete data in this year
	drop if birthyear==2011

	* 2) mother fixed effects: one id per distinct combination of the identifying fields
	cap drop mgroup
	egen mgroup=group(mfirstname mmaidenname mbirthdate mbirthplace mbirthyear)

	* 3) create the education variable across all years BEFORE restricting to 20-40 yr old mothers
	replace meduc = -1 if meduc==99 // otherwise we pick up the missings as maximums
	bys mgroup: egen max_meduc = max(meduc)

	* 4) keep mothers aged 20-40; education above was computed over all of a mother's births
	keep if inrange(mage, 20, 40)

	* 5) if haven't dropped already - keep only 1989-2015 (used all years to calculate name foreignness?)
	drop if birthyear<1989

	save "$main/3work/compiled_country_specific_indexalt.dta", replace
	
	
	
	
	
		
