
* Session setup: start from an empty session, turn off output paging and close any open log.
clear all
set more off 
cap log close 
* NOTE: recent Stata releases manage memory automatically and ignore `set mem`; kept for older installs.
set mem 12g 
set matsize 11000

* Project root and the two main sub-trees used throughout this file.
global dir "/home/research/cavoced/cred" 
global build "$dir/build"
global analy "$dir/analysis"

* This file also reads $temdir, $alldata, $anadir and $prodir but never defines them.
* $temdir is used to re-read a dataset that this file saves under $dir/analysis/temp, so
* default it to that folder when the caller has not already set it.
if "$temdir"=="" {
	global temdir "$analy/temp"
}
* The other three locations cannot be inferred from this file: stop immediately with a clear
* message instead of failing later on a path that starts with "/".
foreach g in alldata anadir prodir {
	if "${`g'}"=="" {
		display as error "global `g' must be defined before running this do-file"
		exit 198
	}
}


* Work from the raw CENCOL data directory.
cd "$build/raw/CENCOLData"


*Start with matched kids
* One row per distinct SSN among matched applicants; saved as a tempfile for the merges below.
use $build/temp/flat_temp_round123_rr, clear
	keep if matched==1
	keep ssn_final
	duplicates drop
	rename ssn_final STUDENT_SSN
	tempfile doof
	save `doof'
	
*Merge on 1st-year stuff	
	merge 1:m STUDENT_SSN using $alldata/ST_HFAll
		keep if _merge==3
		drop _merge
		* Keep a single ST record per SSN. A stable sort preserves the incoming order inside
		* each SSN, so the retained record is the same on every run (a plain -bysort- breaks
		* ties arbitrarily and could keep a different record from one run to the next).
		sort STUDENT_SSN, stable
		by STUDENT_SSN: gen asdf=_n
		keep if asdf==1
		drop asdf
		rename STUDENT_ID SX_STUDENT_ID
		rename COLLEGE_ID SX_COLLEGE_ID
		keep STUDENT_SSN ST_* SX*
*Merge on Enrollment stuff
	* _merge is left in the saved file; it is dropped when the file is re-opened further down.
	merge 1:m SX_STUDENT_ID SX_COLLEGE_ID using $alldata/SX_All
		keep if _merge==3
		save $build/temp/CENCOLdatacomplete_rr_SX, replace
*Merge on Awards
use `doof', clear
	merge 1:m STUDENT_SSN using $alldata/SP_All
	keep if _merge==3
		save $build/temp/CENCOLdatacomplete_rr_SP, replace
		
		
		
		
		
		

/*-----------------------------------------------------------------------------------------------
First, Expand the data to include one whole set of academic information for each lottery the person participates in
(this part builds the applicant file, attaches the enrollment records and looks up "one",
 the value of numlot found for each applicant in the lottery file)
-------------------------------------------------------------------------------*/
	use "$build/temp/flat_temp_round123_rr", clear
		drop if matched!=1
		* rebuild ssn_final as the first non-empty candidate SSN, in priority order
		drop ssn_final
		gen ssn_final=ssn_firstlast
		foreach fallback in ssn_last ssn_firstlast_nod ssn_last_nod ssn_firstlast_nob ssn_last_nob {
			replace ssn_final=`fallback' if ssn_final==""
			}
		rename ssn_final STUDENT_SSN
		* applicants whose SSN appears more than once are removed entirely
		duplicates tag STUDENT_SSN, gen(dupdup)
			tab dupdup
			keep if dupdup==0
		tempfile flat
		save `flat'
		
	use "$build/temp/CENCOLdatacomplete_rr_SX", clear
		drop _merge
		rename (SX_UNITS SX_CONTROL SX_TERM_ID)(sx_units_att sx_control sx_Term_)
	*drop running_number  lastname_a firstname_a stid gender day month year multiname order altname_first altname_last ssn* merge*   anym*   matched dup
		merge m:1 STUDENT_SSN using `flat'	
		* keep matched rows and applicants without any enrollment record
		drop if _merge==1
			drop _merge
			preserve
	
		* side computation on a copy of the data: stid -> numlot from the lottery file
		keep stid STUDENT_SSN
		drop if stid==""
		duplicates drop
		merge 1:m stid using $analy/temp/lottery_nurseCENCOL_flat_new
		drop if _merge!=3
		rename numlot one
		keep STUDENT_SSN  one
		duplicates drop
		tempfile temp
		save `temp'
		restore
		* attach "one" to every row of the enrollment-level data
		merge m:1 STUDENT_SSN using `temp'
		
		drop _merge
		tempfile temp
		save `temp', replace
		* Make one full copy of each student's records per lottery session:
		* a student with one==k gets k copies, numbered numlotsessions = 1..k.
		* (The hand-written version of this step built the "3 lotteries, session 2" piece
		*  from the one==2 students, so three-lottery students lost session 2 and
		*  two-lottery students received session 2 twice.)
		* Students with one missing or above maxlots are dropped, as before.
		local maxlots 4
		forvalues k=1/`maxlots' {
			forvalues s=1/`k' {
				use `temp', clear
				keep if one==`k'
				if `k'==1 {
					* single-lottery students: "one" itself becomes the session number
					rename one numlotsessions
					}
				else {
					gen numlotsessions=`s'
					}
				tempfile temp`k'_`s'
				save `temp`k'_`s''
				}
			}
		
		* stack all the pieces, in the order (k=1), (k=2: s=1,2), (k=3: s=1,2,3), ...
		clear
			use `temp1_1'
			forvalues k=2/`maxlots' {
				forvalues s=1/`k' {
					append using `temp`k'_`s''
					}
				}

*merge on the lottery outcomes data
* the session counter becomes the second merge key; only matched rows are kept
	gen lotsessnum=numlotsessions
		drop numlotsessions
		merge m:1 stid lotsessnum using $analy/temp/lottery_nurseCENCOL_flat_new
		drop if _merge!=3
*renames
* ssn = original SSN + "_" + session number, i.e. one id per person-session
	rename STUDENT_SSN ssn_og
	gen ssn=ssn_og+"_"+string(lotsessnum)
		* crosswalk between the two ids, used again for the awards data
		preserve
			keep ssn ssn_og
			duplicates drop
			tempfile ssnlist
			save `ssnlist'
		restore
	
*Clean coursework	
		* units: strip the "NULL" placeholder and convert to numeric
		destring sx_units, replace ignore("NULL")

		* prerequisite-course indicators: the i-th course name goes with the i-th control number
		local prereqnames Anthro2 AT11 Bio20 Bio21A Bio21B Bio22 Bio24 Bio31 Bio5 Chem3A Chem3B Eng1A FN40 Math103 Psy2 Soc1A
		local prereqcodes CCC000228755 CCC000217181 CCC000339884 CCC000522864 CCC000522863 CCC000040636 CCC000071246 CCC000076302 CCC000238971 CCC000324061 CCC000232902 CCC000118915 CCC000314744 CCC000127738 CCC000229674 CCC000115895
		local nprereq : word count `prereqnames'
		forvalues i=1/`nprereq' {
			local cname : word `i' of `prereqnames'
			local ccode : word `i' of `prereqcodes'
			gen prer_`cname'=sx_control=="`ccode'"
			}
		
		gen fprer_finpre=.
		
		* first course of each program sequence
		gen star_124020=sx_control=="CCC000214253"
		gen star_122500=sx_control=="CCC000330871"
		gen star_123010=(sx_control=="CCC000457262" | sx_control=="CCC000251598")
				/*Note: the second of the two controls is from 2009-2014, and the first is from before 2009*/
		gen star_121000=sx_control=="CCC000191356"

		* per person-session: t<flag> = ever took the course, p<flag> = took it with non-zero units
			foreach flag of varlist prer*  star* {
				replace `flag'=0 if sx_units>89 /*don't count if didn't actually enroll*/
				bysort ssn: egen t`flag'=total(`flag')
				replace `flag'=0 if sx_units==0
				bysort ssn: egen p`flag'=total(`flag')
				replace t`flag'=(t`flag'>0)
				replace p`flag'=(p`flag'>0)
				}

		* the two bio routes shared by every program (Bio20+Bio22, or Bio21A+Bio21B)
		local passedbio (pprer_Bio20==1 & pprer_Bio22==1) | (pprer_Bio21A==1 & pprer_Bio21B==1)
		local tookbio (tprer_Bio20==1 & tprer_Bio22==1) | (tprer_Bio21A==1 & tprer_Bio21B==1)
		*People who PASSED the bio requirements
		gen widenet_124020=`passedbio'
		gen widenet_122500=`passedbio'|pprer_Bio24==1
		gen widenet_123010=`passedbio'
		gen widenet_121000=`passedbio'|pprer_Bio24==1
		*People who TOOK the bio requirements
		gen widestnet_124020=`tookbio'
		gen widestnet_122500=`tookbio'|tprer_Bio24==1
		gen widestnet_123010=`tookbio'
		gen widestnet_121000=`tookbio'|tprer_Bio24==1
		*People who ENROLLED in the first course of the sequence
		foreach top in 124020 122500 123010 121000 {
			gen enrol_`top'=tstar_`top'==1
			}
		
		*make sure demographic variables are defined just in the first term
		*destring ST_TERM, replace ignore("NULL")
		*destring first_term, replace ignore("NULL")
		*destring FIRST_TERM, replace ignore("NULL")
		* race is taken from the ST record, gender from the applicant file
		gen cl_race=ST_RACE
		gen cl_gender=gender 
		
			drop ST_RACE ST_GENDER gender

		* checkpoint to disk, then reload it
		save $dir/build/temp/tempmatched2_rr, replace
		use  $dir/build/temp/tempmatched2_rr, clear
		
***************************************************************************************************************************
*Run Cleaning program
	*Program based in large part on data_cred_clean_prog

************************************************************	
	*need to get rid of awards data for this
	* everything from here to the matching -restore- works on a copy of the data
	preserve
	*drop SP_*
	duplicates drop


	*demographics (define at first term)
			rename cl_gender gender
			rename cl_race  race
			* sex dummies are left missing when gender is blank
			gen byte sex_M = (gender=="M") if gender!="" 
			gen byte sex_F = (gender=="F") if gender!="" 
			*race
			gen byte race_W = (race =="W.")  
			gen byte race_B = (race =="B.")  
			gen byte race_H = (race =="H.")  
			gen byte race_A = (race =="A.")  
			gen byte race_O = inlist(race,"F.","N.","P.","T.")
			gen byte race_X = inlist(race,"X.","")
			* make each dummy constant within person-session: within-ssn mean, rounded
			foreach dem of varlist sex_* race_*{
				bysort ssn: egen blap=mean(`dem')
				drop `dem'
				rename blap `dem'
				replace `dem'=round(`dem')
				}

	*School years
	* Turn the term code into a decimal value: codes above 200 get 1900 added and codes below
	* 200 get 2000 added, after dividing the code by 10 (so its last digit becomes the first decimal).
	gen term=sx_Term
	destring term, replace
	replace term=1900+(term/10) if term>200
	replace term=2000+(term/10) if term<200
	* dropme = fractional part of term
	gen dropme=term-floor(term)
	gen schyear=floor(term)
		* fractional part below 0.4: assign to the school year numbered one lower
		replace schyear=schyear-1 if dropme<0.4 
	*create a time variable consistent with the wage data
	* yearqtr = integer year plus 0, 0.25, 0.5 or 0.75 depending on the range the fractional part falls in
	* NOTE(review): the cut-offs compare against fractional parts computed from a stored
	* (presumably float) value, so behaviour exactly at 0.3 / 0.4 depends on rounding — confirm before changing.
	gen yearqtr=floor(term)
		replace yearqtr=floor(term)+0.25 if dropme>0.3 & dropme<0.41
		replace yearqtr=floor(term)+0.5 if dropme>0.41 & dropme<0.61
		replace yearqtr=floor(term)+0.75 if dropme>.61 & dropme<.99
		drop term

	*First term at CENCOL
		*drop first_term
		* first_term: earliest yearqtr over all of the person-session's records
		bysort ssn: egen first_term=min(yearqtr)
		* first_termCENCOL: earliest yearqtr among records whose college is "CENCOL";
		* the min is only filled on those rows, so the mean spreads it to every row of the ssn
		bysort ssn: egen bloop=min(yearqtr) if SX_COLLEGE=="CENCOL"
			bysort ssn: egen first_termCENCOL=mean(bloop)
			drop bloop

	*What year did they start the nursing program
			* earliest yearqtr with the star_12301 flag set (name is abbreviated), spread to all rows of the ssn
			bysort ssn: egen classof_st12301=min(yearqtr) if star_12301==1
			bysort ssn: egen blap=mean(classof_st12301)
			drop classof_st12301
			rename blap classof_st12301
	*some other edits
	* grade: trimmed copy of the raw grade string
	rename SX_GRADE grade
	replace grade=trim(grade)
	
	*age at term
			* term of the first application
			gen lotterm_1=firstapp_term

	* birth date expressed on the same year + quarter/4 scale as yearqtr
	destring year month, replace
	gen birthqtr=year
		replace birthqtr=year+0.25 if inlist(month,4,5,6)
		replace birthqtr=year+0.5  if inlist(month,7,8,9)
		replace birthqtr=year+0.75 if inlist(month,10,11,12)
	* age at each record, and age at the first application
	gen age=yearqtr-birthqtr
	gen age_atfirst=lotterm_1-birthqtr
	
			
				
	********************************************
	*Transcript information
	******************************************
	
	*units for GPA purposes: 
		* tagclass marks one row per (person-session, term, course) so repeated rows count once
		egen tagclass=tag(ssn yearqtr sx_control)
		destring sx_units_att, gen(units_att) ignore("NULL")
			*recode units if withdrew, etc
			replace sx_units_att=0 if inlist(grade,"DR","FW","W","MW","XX")==1
			replace sx_units_att=sx_units_att*tagclass
		* grade points per unit.
		* A+ is scored 4.0 (it was previously entered as 3, i.e. below A-).
		* NOTE(review): "P" is also scored 4.0 — confirm that is intended.
		gen graden=4 if grade=="A"
		replace graden=4 if grade=="A+"
		replace graden=3.7 if grade=="A-"
		replace graden=3.3 if grade=="B+"
		replace graden=4 if grade=="P"
		replace graden=3 if grade=="B"
		replace graden=2.7 if grade=="B-"
		replace graden=2.3 if grade=="C+"
		replace graden=2 if grade=="C"
		replace graden=1.7 if grade=="C-"
		replace graden=1.3 if grade=="D+"
		replace graden=1 if grade=="D"
		replace graden=0.7 if grade=="D-"
		replace graden=0 if grade=="F"
		* from here on graden holds grade points x units
		replace graden=graden*sx_units_att 
	/*GPA before first application*/
		* numerator: grade points from terms before the first application
		gen numerator=graden if yearqtr<lotterm_1 & tagclass==1
			bysort ssn: egen totnumerator=total(numerator)
		* denominator: units from the same terms
		gen denominator=sx_units_att if yearqtr<lotterm_1 & tagclass==1
			bysort ssn: egen unitsatt_prelot1=total(denominator)
		gen gpa_prelot1=totnumerator/unitsatt_prelot1
		* units_prelot1: units in pre-application courses with positive grade points
		gen unitsp=sx_units_att if graden>0 & graden!=. & yearqtr<lotterm_1 & tagclass==1
			bysort ssn: egen units_prelot1=total(unitsp)
		drop numerator denominator totnumerator unitsp

	/*GPA in prereq before enrollment*/	
		* prereqflag = number of prerequisite indicators set on the row
		egen prereqflag=rowtotal(prer_*)
		gen gradenprereq=graden if prereqflag==1 & yearqtr<lotterm_1 & tagclass==1
		bysort ssn: egen totearnedprereq=total(gradenprereq)
		gen units_attprereq=sx_units_att if prereqflag==1 & gradenprereq~=.&yearqtr<lotterm_1 & tagclass==1
		bysort ssn: egen totattprereq=total(units_attprereq)
		gen gpa_prereq=totearnedprereq/totattprereq
			drop gradenprereq units_attpre totearnedpre
			
	/*GPA by term*/
		bysort ssn yearqtr: egen totgrad=total(graden)
		bysort ssn yearqtr: egen unitsatt_term=total(sx_units_att)
		gen gpa_term=totgrad/unitsatt_term
			drop totgrad
			sum gpa*
	

	*Took classes at another college?
		* col: copy of the college id, with a leading "0" added to two-character ids
		gen col=SX_COLL
			replace col="0"+col if length(col)==2
		* row-level flags: record is at a college other than "CENCOL" (any time / before first application)
		gen enr_othcol=col!="CENCOL" & col!=""
		gen enr_othcol_prelot1=col!="CENCOL" & yearqtr<lotterm_1
			* turn the row-level flags into person-session "ever" indicators
			bysort ssn: egen zoop=total(enr_othcol)
			replace enr_othcol=zoop>0 
			bysort ssn: egen zoop2=total(enr_othcol_prelot1)
			replace enr_othcol_prelot1=zoop2>0
	*Took classes at another district?
		* same construction, but college "572" is also treated as not "other"
		gen enr_othdis=(col!="CENCOL" & col!="572") & col!=""
		gen enr_othdis_prelot1=(col!="CENCOL" & col!="572") & yearqtr<lotterm_1
			bysort ssn: egen noop=total(enr_othdis)
			replace enr_othdis=noop>0 
			bysort ssn: egen noop2=total(enr_othdis_prelot1)
			replace enr_othdis_prelot1=noop2>0	
	*TRANSFER INFO
	*Do this at the end
	* first_dat / first_seg are abbreviated variable names; they are renamed to trans_date / trans_seg
	rename first_dat trans_date
	rename first_seg trans_seg
	replace trans_date="" if trans_date=="NULL"
	* characters 1-4 of the date string are read as the year, characters 5-6 as the month
	gen trans_schyear=substr(trans_date,1,4)
		destring trans_schyear, replace ignore("HW.")
	gen trans_mo=substr(trans_date,5,2)
		destring trans_mo, replace
	* months before 7 are moved to the following year value
	replace trans_schyear=trans_schyear+1 if trans_mo<7
	* keep only the rows carrying the person-session's earliest transfer year
	* (rows are kept when both values are missing, since missing equals missing)
	bysort ssn: egen mintrans=min(trans_schyear)
		drop if trans_schyear!=mintrans
		drop mintrans
	replace trans_seg="" if trans_seg=="NULL"
	
	
	
	* Drop the working variables, remove exact duplicate rows and report how many rows remain
	* per (ssn, yearqtr). Commands between the two #delimit lines end with ";".
	#delimit;
	drop col    ST_AGE ST_ZIP    SX* sx_* dup _merge grade race gender  prereqflag  fpre*      graden* prer* star*       zoop*  dropme schyear tagclass;
	des;
	duplicates drop;
	duplicates report ssn yearqtr;
	#delimit cr
	
		* mostofit: the cleaned academic file, reloaded after the awards are built
		tempfile mostofit
			save `mostofit'
		* temp1: one row per distinct (ssn, classof*, lotterm_1), merged onto the awards data
		keep ssn classof* lotterm_1
			duplicates drop
			tempfile temp1
			save `temp1'
	* closes the -preserve- opened at the start of the cleaning section
	restore
	

***********************************************************************	
*NOW JUST DO AWARDS 
***********************************************************************
	* awards file -> one row per award x person-session id, plus first-application term
	use $build/temp/CENCOLdatacomplete_rr_SP, clear
		rename STUDENT_SSN ssn_og
			drop _merge
			joinby ssn_og using `ssnlist', unmatched(both)
				drop _merge
		merge m:1 ssn using `temp1'
			
			* award term code -> decimal value (same conversion as for course terms)
			cap destring SP_TERM, replace ignore("NULL")
			replace SP_TERM=1900+(SP_TERM/10) if SP_TERM>200
			replace SP_TERM=2000+(SP_TERM/10) if SP_TERM<200
			gen dropme3=SP_TERM-floor(SP_TERM)
			gen awyear=floor(SP_TERM)
		replace awyear=awyear-1 if dropme3<0.4 /*next spring counts as this year*/
	*get rid of awards that happen BEFORE Treatment, but create an indicator for students who get them
			gen pre_award=awyear<lotterm_1 & lotterm_1 !=.
			bysort ssn: egen asdf=total(pre_award)
				replace pre_award=asdf>0
				drop asdf
	* blank out placeholder award types and program codes that are not six characters long
	replace SP_AW="" if SP_AW=="NULL"
	replace SP_TOP="" if length(SP_TOP)~=6
	replace SP_TOP="" if SP_TOP=="NULL"
	
	
	duplicates drop
	* row-level award indicators; all of them require pre_award==0
	foreach top in 122500 124020 121000 123010 {
		gen d_`top'=SP_TOP=="`top'" & inlist(SP_AW,"A","F","S","T")==1 & pre_award==0
		}
	gen d_anytop2=substr(SP_TOP, 1, 2)=="12" & pre_award==0
	gen d_nontop2=substr(SP_TOP, 1, 2)!="12" & SP_AW~="" & pre_award==0
	gen d_noaward=SP_AW=="" & pre_award==0
	gen d_anyaward=SP_AW~="" & pre_award==0
	gen d_123010_CENCOL=d_123010==1 & SP_COLL=="CENCOL"
	gen d_12_CENCOL=d_anytop2==1 & SP_COLL=="CENCOL"
	gen d_123010_oth=d_123010==1 & d_123010_CENCOL==0
	gen d_12_oth=d_anytop2==1 & d_12_CENCOL==0

	local awardtypes 122500 124020 121000 123010 anytop2 nontop2  anyaward 123010_CENCOL 12_CENCOL 123010_oth 12_oth
	* ever received each type (0/1 per person-session)
	foreach awa of local awardtypes {
		bysort ssn: egen awa_`awa'=total(d_`awa')
		replace awa_`awa'=1 if awa_`awa'>1 & awa_`awa'~=.
		}	
	*award date for each type
	foreach awa of local awardtypes {
		bysort ssn: egen blah=min(SP_TERM) if d_`awa'==1
		bysort ssn: egen awa_minterm_`awa'=mean(blah)
		drop blah
		}
		
		

	*flatten it out
keep ssn pre_award awa_*
	duplicates drop
	sort  ssn
	tempfile temp2
	save `temp2'
****************************************************
	
		* back to the cleaned academic file
		use `mostofit', clear
	
	*MERGE BACK ON AWARDS INFO
	merge  m:1 ssn using `temp2'
	
	drop if lotterm<2005
	drop tprer* pprer* widenet*
			
	save $dir/analysis/temp/aca_awa_CENCOL_new_rr, replace
	use $dir/analysis/temp/aca_awa_CENCOL_new_rr, clear

/*------------------------------------------------------------------------
Stack them all
  copy 1 holds every row with the lottery-1 fields in lot_*;
  copies 2-5 hold only rows with a non-missing mlot<z>_cat, with lottery z in lot_*
------------------------------------------------------------------------*/		
	gen mlot5_impu=.
	local lotfields term adm rej alt inoth1 inoth2 imprej impu cat
		preserve
			gen lotnum=1
			foreach v of local lotfields {
				gen lot_`v'=mlot1_`v'
				}
			
			tempfile temp1
			save `temp1', replace
		
			forvalues z=2/5 {
			* go back to the unstacked data for each further lottery
			restore, preserve
			
			drop if mlot`z'_cat==.
			gen lotnum=`z'
			foreach v of local lotfields {
				gen lot_`v'=mlot`z'_`v'
				}
			tempfile temp`z'
			save `temp`z'', replace
			}
		restore
		use `temp1', clear
			forvalues z=2/5 {
				append using `temp`z''
				}
			
			
			
			
			
			
****************************************
*Last chunk of cleaning
******************************************

* Pull enrol_12301 and classof (both abbreviated names) from the aca_CENCOL_withwages file,
* one row per ssn, renamed to menr / mcla, and use them to fill in the stacked data.
* NOTE(review): that file does not carry the _rr suffix used by the outputs of this do-file;
* presumably it comes from an earlier run — confirm it is the intended source.
preserve
use $temdir/aca_CENCOL_withwages, clear
	keep ssn enrol_12301 classof
	drop if enro==.
	duplicates drop
	rename (enr class) (menr mcla)
	tempfile tempmenr
	save `tempmenr'
	restore
drop _merge
merge m:1 ssn using `tempmenr'
	* switch the enrolment flag on when the other file has it on
	replace enrol_12301=1 if menr==1 & enrol_12301==0
	* fill a missing classof from the other file for enrolled rows
	replace classof=mcla if classof==. & enrol_12301==1

*PREAMBLE: to ensure all analyses using the same dataset
* second admission definition: category 1, 4 or 5
foreach z in 1 2 3 4 5{
	gen mlot`z'_adm2=(mlot`z'_cat==1|mlot`z'_cat==4|mlot`z'_cat==5)
	}
	gen lot_adm1=lot_adm
	gen lot_adm2=(lot_cat==1 | lot_cat==4 | lot_cat==5)	
*******************************************************************
*Fixed Effects
*******************************************************************
	*Change lottery terms to be quarter-specific (0.7 become 0.75 and 0.1 become 0.0)
	* the third rule moves fractional parts of about 0.3 to 0.5
	foreach var in mlot1_term mlot2_term mlot3_term mlot4_term mlot5_term lot_term{
		replace `var'=round(`var') if (`var'-round(`var')>0) & (`var'-round(`var')<0.15)
		replace `var'=`var'+0.05 if (`var'-round(`var')>-0.31) & (`var'-round(`var')<-0.28)
		replace `var'=round(`var')+0.5 if (`var'-round(`var')>0.29) & (`var'-round(`var')<0.31)
		}
		
	*Lottery fixed effects
	gen roundlot=round(lot_term)
		qui tab roundlot, gen(rlotfe)
		*qui tab lotyear_by_num, gen(rlotintfe)
	gen blap=lotnum*lot_term
		*qui tab blap, gen(lotint)
		qui tab lotnum, gen(lotnumfe)

	* quarter and calendar-year dummies
	gen qtr=yearqtr-round(yearqtr)
		tab qtr, gen(qfe)
	* "year" still holds the birth year read from the applicant file at this point, which made
	* -gen year- stop with "variable year already defined". The birth year is already folded
	* into birthqtr / age_atfirst, so it is dropped before the calendar year is created.
	capture drop year
	gen year=round(yearqtr)
		tab year, gen(yfe)
	* age is rebuilt relative to the first lottery and capped to 18-65 for the dummies
	drop age
		gen age=round(age_atfirst+yearqtr-mlot1_term)
		gen agefor=age
			replace agefor=18 if age<18
			replace agefor=65 if age>65
		qui tab agefor, gen(agfe)
****************************************************************
*Weight
****************************************************************		
* numlots = largest lotnum observed for the ssn
bysort ssn: egen numlots=max(lotnum)
		/*gen invweight=1/numlots
foreach g in 1 2 3 4 {
	bysort ssn: egen numlots`g'=max(lotnum) if lotnum<=`g'
		gen invweight`g'=1/numlots`g'
	}*/
	* tag one row per (ssn, lotnum) and one row per ssn
	egen appflag=tag(ssn lotnum)
	egen tagflag=tag(ssn)
*variable for clustering
		gen ssnl=ssn+string(lotnum)

*****************************************************************
*Other Variables
***************************************************************	
*Variable for years since lottery
	* NOTE(review): the fixed-effects loop earlier in this file rewrites fractional parts of
	* lot_term of about 0.1 / 0.3 / 0.7 to 0 / 0.5 / 0.75 before this point, so the three
	* ranges below may never match and lt (and yqsince_lot) may be missing everywhere — confirm.
	gen lt=.
		replace lt=0.25 if lot_term-floor(lot_term)>0.09 & lot_term-floor(lot_term)<0.11
		replace lt=0.5 	if lot_term-floor(lot_term)>0.29 & lot_term-floor(lot_term)<0.31
		replace lt=0.75 if lot_term-floor(lot_term)>0.68 & lot_term-floor(lot_term)<0.71
	gen yqsince_lot=yearqtr-(floor(lot_term)+lt)
	gen yearssince_lot=round(yqsince_lot)
	*qui tab yearssince_lot, gen(ysfe)
*Other variables
* NOTE(review): no statement above this point in the file creates "wages"; it must already
* be present in the data for the next line to run — confirm.
gen logwages=ln(wages)
* emp = 1 when log wages are non-missing
gen emp=logwages!=.
drop if yearqtr==.

	* missing pre-application GPA is set to 0 and flagged in gpa_missing
	gen gpa_missing=gpa_prelot1==.
	replace gpa_prelot1=0 if gpa_prelot1==.
	gen race_OBX=(race_B==1 | race_X==1 | race_O==1)

	*Create variables for everwin, neverwin, everalt, etc.
		* "ever" indicators across lotteries 1-5
		gen everwin=mlot1_adm==1 | mlot2_adm==1|mlot3_adm==1 | mlot4_adm==1 | mlot5_adm==1
		gen everalt=mlot1_alt==1 | mlot2_alt==1|mlot3_alt==1 | mlot4_alt==1 | mlot5_alt==1
	   gen everoth1=mlot1_inoth1==1 | mlot2_inoth1==1|mlot3_inoth1==1 | mlot4_inoth1==1 | mlot5_inoth1==1
	   gen everoth2=mlot1_inoth2==1 | mlot2_inoth2==1|mlot3_inoth2==1 | mlot4_inoth2==1 | mlot5_inoth2==1
	   gen neverwin=everwin==0 & everalt==0 & everoth1==0 & everoth2==0
	
		* definition 1 = as above; definition 2 also counts inoth1 / inoth2 as a win
		gen everwin1=everwin
		gen neverwin1=neverwin
		gen everwin2=everwin==1 | everoth1==1 | everoth2==1
		gen neverwin2=everwin2==0 & everalt==0
		
**Merge on financial aid in 4 years prior to first lottery
* the merge indicator is stored as zzzzzzzzz instead of _merge
merge m:1 ssn using $analy/temp/sfa_CENCOL_any, gen(zzzzzzzzz)
* missfin flags rows where any of the three aid variables is missing; all sfa* are then zero-filled
gen missfin=sfa_bog==. | sfa_pell==. | sfa_calg==.
foreach v of varlist sfa*{
	replace `v'=0 if `v'==.
	}
*******************************************
*Labels
*******************************************
	* NOTE(review): several variables labelled below (emp_pre, emp_qspre816, emp_pre816_half,
	* wag_pre, wag_pre4, wag_lnpre4, emp_hepre, emp_hepre4) are not created above this point
	* in the file; they must already be in the data for these lines to run — confirm.
	label var  sex_F "Female"
	label var  race_W "White"
	label var  race_B "Black"
	label var  race_H "Hispanic"
	label var  race_A "Asian"	
	label var race_OBX "Other Race"
	label var  age_atfirst "Age"
	label var  gpa_prelot1 "GPA"		
	label var  units_prelot1 "Units"
	label var  unitsatt_prelot1 "Units"
	label var  enr_othcol_prelot1 "Enrolled at other college"
	label var  enr_othdis_prelot1 "Enrolled in other district"
	 label var emp_pre "Employed"
	 label var emp_qspre816  "Quarters Employed"
	 label var emp_pre816_half "Consistent Employment"
	 label var wag_pre  "Quarterly Earnings"
	 label var wag_pre4 "Quarterly Earnings"
	 label var wag_lnpre4 "Quarterly (log) Earnings)"
	 label var sfa_pell "Had Pell Grant"
	 label var sfa_bog "Had BOG Waiver"
	 label var sfa_cal "Had Calgrant"
	 label var sfa_loan "Had Loans"
	 label var emp_hepre "Employed in Health"
	 label var emp_hepre4 "Employed in Health"
	 

* Covariate lists kept in globals: one list for tables and five regression lists
global tabvarlist "sex_F race_W race_B race_H race_A 				 age_atfirst gpa_prelot1 		 units_prelot1 unitsatt_prelot1 enr_othcol_prelot1 enr_othdis_prelot1 emp_pre emp_qspre816 emp_pre816_half wag_pre wag_pre4 emp_hepre emp_hepre4"
global regvarlist11 "sex_F 			race_B race_H race_A race_O race_X age_atfirst gpa_prelot1 	enr_othdis_prelot1 emp_qspre816 emp_pre816_half emp_hepre4"
global regvarlist12 "sex_F 			race_B race_H race_A race_O race_X age_atfirst gpa_prelot1 units_prelot1		 enr_othdis_prelot1 emp_qspre816 emp_pre816_half emp_hepre4"
global regvarlist21 "sex_F 		 	race_H race_A race_OBX age_atfirst gpa_prelot1 		 enr_othdis_prelot1 emp_qspre816 emp_pre816_half emp_hepre4"
global regvarlist22 "sex_F 		 	race_H race_A race_OBX age_atfirst gpa_prelot1 units_prelot1 enr_othdis_prelot1 emp_qspre816 emp_pre816_half emp_hepre4"

global regvarlist23 "sex_F 		 	race_H race_A race_OBX age_atfirst gpa_prelot1 units_prelot1 enr_othdis_prelot1 "

			
	* runs an external do-file; its contents are not visible from this file
	do  "$prodir/CENCOL/CENCOL_preamble2.do"

*replace pre-labor market as 0s if missing
foreach z of varlist emp_pre* wag_* emp_h*{ 
	replace `z'=0 if `z'==.
	}
* NOTE(review): the local macro temptrn is never defined in this file, so the "using" file
* name below expands to nothing and this merge cannot run as written — supply the intended file.
merge m:1 ssn using `temptrn', gen(mergetran)
	* recode trans_flag to 0/1 (missing becomes 0)
	replace trans_flag=trans_flag==1
	gen trans_UCCSU=trans_seg=="UC"|trans_seg=="CSU"

	* The three blocks below use the window (mlot1_term-4, mlot1_term), i.e. the four years
	* before the first lottery term, and aggregate within (ssn, lotnum).
	* mean wages in the window (0 when there are none)
	gen trash=wages if yearqtr<mlot1_term & yearqtr>(mlot1_term-4) & yearqtr!=. & mlot1_term!=.
		bysort ssn lotnum: egen emp_prewag=mean(trash)
			replace emp_prewag=0 if emp_prewag==.
			drop trash
	* number of rows with non-missing wages in the window, and an any-employment flag
	gen trash=wages!=. if yearqtr<mlot1_term & yearqtr>(mlot1_term-4) & yearqtr!=. & mlot1_term!=.
		bysort ssn lotnum: egen emp_numq=total(trash)
			gen emp_anypre=emp_numq>0
			drop trash
	* any row with work_health==1 in the window
	gen trash=work_health==1 if yearqtr<mlot1_term & yearqtr>(mlot1_term-4) & yearqtr!=. & mlot1_term!=.
		bysort ssn lotnum: egen zzz=total(trash)
			gen hemp_anypre=zzz>0
			drop zzz trash
		* the window-based versions replace the existing emp_pre / wag_pre
		drop emp_pre wag_pre
		rename emp_prewag wag_pre
		rename emp_anypre emp_pre
		
		
* short names for the main outcome variables
rename awa_123010_CENCOL awa
rename enrol_123010 enr
rename work_health hemp
* enrn: enrolled, with classof between 0.1 before and 1 after the lottery term (exclusive bounds)
gen enrn=(enr==1 & classof-lot_term<1 & classof-lot_term>-.1)


label var awa "Finish Program"
label var enr "Start Program"

drop if mlot1_term==.

	* wages0: wages with missing values set to 0
	gen wages0=wages
		replace wages0=0 if wages==.

	* enrolled: more than 6 units in the term
	gen enrolled=unitsatt_term>6 & unitsatt_term!=.
	* time since the first lottery / since this lottery, on the yearqtr scale
	gen ysince_lot1=yearqtr-mlot1_term
	gen ysince_lot=yearqtr-lot_term
		replace ysince_lot=. if ysince_lot>100

	gen flot=numlots==5
	qui tab mlot1_term, gen(lot1fe)
	gen rmlot1=round(mlot1_term)
		qui tab rmlot, gen(rlot1fe)		
	gen rys1=round(ysince_lot1)
	gen rys=round(ysince_lot)

	*Intervals
	* indicator for each window of rounded years since the lottery (rys)

		gen intpre=rys<0 & rys>-4
		gen int01=rys==0 | rys==1
		gen int28=rys>=2 & rys<9
		gen int38=rys>=3 & rys<9
		gen int48=rys>=4 & rys<9
		gen int58=rys>=5 & rys<9
		gen int68=rys>=6 & rys<9
		gen int24=rys>=2 & rys<5
		gen int27=rys>=2 & rys<8
		gen int57=rys>=5 & rys<8
		
	*O-wage
	* log wages with missing values set to 0
	gen logwage0=logwage
		replace logwage0=0 if logwages==.
		
	gen awa2=awa_123010	
	*Globals
	* (these three are redefined again just before the save)
	global racevar "race_H race_A race_OBX age_prelot"
	global acadvar "gpa_prelot1  enr_othdis_prelot1 "
	global empvar "wag_pre emp_pre emp_hepre"


qui tab lot_term, gen(loofe)
		label var lot_adm "Win Lottery"
		label var awa "Finish Program"
		label var awa2 "Finish any ADN"
		label var enr "Start Program"
gen doof=.	

		
* enrn already exists at this point (it is created right after enr is renamed, with this
* same expression), so the repeated -gen enrn- that stood here stopped the run with
* "variable enrn already defined". It has been removed.
			

*Indicator for each lottery
		gen lot_adm_1=(lot_adm==1)*(lotnum==1)
		gen lot_adm_2=(lot_adm==1)*(lotnum==2)
		gen lot_adm_3=(lot_adm==1)*(lotnum==3)
		gen lot_adm_4=(lot_adm==1)*(lotnum==4)
		gen win14=lot_adm
		label var lot_adm "Win 1st Lottery"
		label var lot_adm_1 "Win 1st Lottery"
		label var lot_adm_2 "Win 2nd Lottery"
		label var lot_adm_3 "Win 3rd Lottery"
		label var lot_adm_4 "Win 4th Lottery"
		label var win14 	"Win Lottery"
		* NOTE(review): timein1 is not created anywhere above in this file — confirm it exists in the data.
		label var timein1 "Years in CC"
		label var numlots "Number of Lotteries Participated"
		
		
		
	global racevar "age_at  race_H race_A race_OBX"
	global acadvar "gpa_prelot1   "
	global empvar "hemp_anypre emp_pre "	
			
			
		save $dir/analysis/temp/aca_awa_CENCOL_stacked_rr, replace
		
		
		
	
	
	
************************************************************************************
*Gather earnings and NAICS data for CENCOL folks
************************************************************************************
*First, get list of SSN's from CENCOL.
use  "$temdir/aca_awa_CENCOL_stacked_rr", clear
keep ssn
	*take out the underscore
	* (keeps the part of the id before the first "_")
	gen STUDENT_SSN=substr(ssn,1,strpos(ssn,"_")-1)
	drop ssn
duplicates drop
tempfile ssnlist
save `ssnlist'

* The same routine runs twice: first for the wages_all_* files, then for the wages_naics_* files.
* Each pass starts its output file with a single placeholder row and then appends the
* matched records of every year-quarter file from 2000q1 to 2014q4.
foreach kind in all naics {
	*Create an empty dataset
	clear
	set obs 1
	gen STUDENT_SSN="DROPME"
	save $anadir/CENCOL_earnings_`kind'_rr, replace
	*Merge CENCOL Students to their earnings and save
	cd $build/raw/earnings
	forvalues y=2000/2014{
		forvalues q=1/4{
			use `ssnlist', clear
			merge 1:m STUDENT_SSN using wages_`kind'_`y'_`q'
			drop if _merge!=3
			drop _merge
			append using $anadir/CENCOL_earnings_`kind'_rr
			save $anadir/CENCOL_earnings_`kind'_rr, replace
			}
		}
	}




**************************************************************
*Merge WAGES on to CENCOL data
*************************************************************
use  "$temdir/aca_awa_CENCOL_stacked_rr", clear
gen STUDENT_SSN=substr(ssn,1,strpos(ssn,"_")-1)
drop _merge
*variables for quarters since first lottery
gen qts_since=round(yearqtr-mlot1_term, 0.25)

	*Separate into time-varying and time-nonvarying
		*Time-invariant
			keep ssn STUDENT_SSN mlot* lot* enrol* sex* race* age_at *prelot1 awa_* 
			duplicates drop
	* every earnings record of the person is attached to each of their rows;
	* earnings records without a match in the CENCOL data are discarded
	joinby  STUDENT_SSN using $anadir/CENCOL_earnings_all_rr, unmatched(both)
	drop if _merge==2 
*Make Wages real
	destring WAGES YEAR Q, replace
	* i-th year goes with the i-th index value; each year's wages are scaled by 233.17/index
	local deflyears 2015 2014 2013 2012 2011 2010 2009 2008 2007 2006 2005 2004 2003 2002 2001 2000 1999
	local deflindex 233.71 233.92 230.28 226.67 220.22 216.69 211.14 211.08 202.42 198.30 190.70 185.20 181.70 177.10 175.10 168.80 164.30
	local ndefl : word count `deflyears'
	forvalues i=1/`ndefl' {
		local yr : word `i' of `deflyears'
		local idx : word `i' of `deflindex'
		replace WAGES=WAGES*(233.17/`idx') if YEAR==`yr'
		}
		rename WAGES wages
********************************
	gen yearqtr=YEAR+(Q-1)/4
	*create time-invariant labor market covariates
		*Employed, ever, prior to first application
			gen trash=wages!=. & yearqtr<lotterm_1
			bysort ssn lotnum : egen emp_pre=total(trash)
				replace emp_pre=emp_pre>0
				drop trash
		*Employed, ever, in 4 years prior to first app
			gen trash=wages!=. & yearqtr<lotterm_1 & yearqtr>(lotterm_1-4) & yearqtr!=.
			bysort ssn lotnum: egen emp_qspre816=total(trash)
				gen emp_pre816=emp_qspre816>0
		*Employed in at least 8 of 16 qtrs prior to first app
				gen emp_pre816_half=emp_qspre816>=8
				drop trash		
		*Average earnings (log) prior to first app
			gen trash=wages if yearqtr<lotterm_1
			bysort ssn lotnum: egen wag_pre=mean(trash)
				gen wag_lnpre=ln(wag_pre)
				drop trash
		*Average earnings (log) in 4 years prior to first app
			gen trash=wages if  yearqtr<lotterm_1 & yearqtr>(lotterm_1-4) & yearqtr!=.
			bysort ssn lotnum: egen wag_pre4=mean(trash)
				gen wag_lnpre4=ln(wag_pre4)
				drop trash
		* wage panel: ids, quarter, wages and the pre-application summaries
		keep ssn lotnum yearqtr wages emp_* wag_*
		duplicates drop
		tempfile tempwag
		save `tempwag'
		
		
		

**************************************************************
*Merge INDUSTRY on to CENCOL data
*************************************************************
use  "$temdir/aca_awa_CENCOL_stacked_rr", clear
* original SSN = part of the id before the first "_"
gen STUDENT_SSN=substr(ssn,1,strpos(ssn,"_")-1)
drop _merge
*variables for quarters since first lottery
gen qts_since=round(yearqtr-mlot1_term, 0.25)


	*Separate into time-varying and time-nonvarying
		*Time-invariant
			keep ssn STUDENT_SSN mlot* lot* enrol* sex* race* age_at *prelot1 awa_* 
			duplicates drop
	* attach every NAICS earnings record; records without a match in the CENCOL data are discarded
	joinby  STUDENT_SSN using $anadir/CENCOL_earnings_naics_rr, unmatched(both)
	drop if _merge==2 		
	destring WAGES, replace
	destring YEAR, replace
	destring Q, replace
	gen yearqtr=YEAR+(Q-1)/4
	rename WAGES wages
		*Quarterly indicator for work in health at all
		* health = NAICS code starting with "62"
		* NOTE(review): the by-groups below use "QT" while the quarter is read from "Q" above;
		* "QT" cannot be an abbreviation of "Q", so it must be a separate variable in the
		* NAICS files (or this is a typo) — confirm.
		gen trash=substr(NAICS,1,2)=="62"
			bysort ssn lotnum YEAR QT: egen work_health=sum(trash)
			replace work_health=work_health>0
			drop trash
		*Quarterly indicator for health is main occupation
		* "main" = the record with the largest wages in the quarter
		bysort ssn lotnum YEAR QT: egen max=max(wag)
			gen trash=(substr(NAICS,1,2)=="62" & wag==max)
			bysort ssn lotnum YEAR QT: egen workm_health=total(trash)
				replace workm_health=workm_health>0
				drop trash
			
		*Employed, ever, in health industry prior to first app
			gen trash=work_health==1 & yearqtr<lotterm_1
			bysort ssn lotnum : egen emp_hepre=total(trash)
				replace emp_hepre=emp_hepre>0
				drop trash		
		*Employed, ever, in health industry in 4 years prior to first app
			gen trash=work_health==1 & yearqtr<lotterm_1 & yearqtr>(lotterm_1-4) & yearqtr!=.
			bysort ssn lotnum : egen emp_hepre4=total(trash)
				replace emp_hepre4=emp_hepre4>0
				drop trash	
		*Employed, ever, in health industry prior to first app AS MAIN OCC
			gen trash=workm_health==1 & yearqtr<lotterm_1
			bysort ssn lotnum : egen empm_hepre=total(trash)
				replace empm_hepre=empm_hepre>0
				drop trash		
		*Employed, ever, in health industry in 4 years prior to first app AS MAIN OCC
			gen trash=workm_health==1 & yearqtr<lotterm_1 & yearqtr>(lotterm_1-4) & yearqtr!=.
			bysort ssn lotnum : egen empm_hepre4=total(trash)
				replace empm_hepre4=empm_hepre4>0
				drop trash		

		* industry panel: ids, quarter and the health-employment indicators
		keep ssn lotnum yearqtr emp_* empm* work_* workm_*
		duplicates drop
		tempfile tempind
		save `tempind'
*********************************************************************
**Merge it all together
*********************************************************************

*Make a balanced panel skeleton
* one row per (ssn, lotnum) for every quarter from 1992.5 to 2014.25
use  "$temdir/aca_awa_CENCOL_stacked_rr", clear
	keep ssn lotnum
	duplicates drop
	preserve
	gen yearqtr=1992.5
	tempfile tempstack
	save `tempstack', replace
	forvalues y=1992.75 (0.25) 2014.25{
		restore, preserve
		gen yearqtr=`y'
		append using `tempstack'
		save `tempstack', replace
		}
		restore
	


*First use time-invariant stuff, then add time-varying stuff to it.
use  "$temdir/aca_awa_CENCOL_stacked_rr", clear
	keep ssn  mlot* lot* enrol* sex* race* age_at *prelot1 awa_* gpa_prereq pre_award first_term* classof_st12301
	duplicates drop
* spread the time-invariant variables over the quarterly skeleton
merge 1:m ssn lotnum using `tempstack'
	tab _merge
	drop _merge
* add quarterly wages, then quarterly industry indicators
merge 1:1 ssn lotnum  yearqtr using `tempwag'
	rename _merge mergewag
merge 1:1 ssn lotnum yearqtr using `tempind'
	rename _merge mergeind
	tab mergewag 
	tab mergeind
	drop mergewag mergeind
	* term-level academic variables. They are read from the _rr stacked file, the same file
	* every other step of this section uses (this one read previously pointed at the file
	* without the _rr suffix, i.e. at the output of a different run).
	preserve
	use  "$temdir/aca_awa_CENCOL_stacked_rr", clear
	keep ssn lotnum yearqtr age gpa_term enr_othcol enr_othdis unitsatt_term
	* identical rows would break the 1:1 merge below
	duplicates drop
	tempfile tempoth
	save `tempoth'
	restore
	
merge 1:1 ssn lotnum yearqtr using `tempoth'



	save $temdir/aca_CENCOL_withwages_rr, replace

			
			
			
			
			