/******************************************************************************************
Nurse Lottery Readin Program
****************************************************************************************/
* Session setup: empty the session, turn off output paging, and close any
* log that is still open.
clear all
set more off
capture log close

* Memory and matrix-size settings.
set memory 12g
set matsize 11000

* Project directory layout (build and analysis trees hang off $dir).
global dir "/home/research/cavoced/cred"
global build "$dir/build"
global analy "$dir/analysis"

* Log this run, then work from the raw-data folder.
log using "$build/prog/CENCOLlottery/nurse_lottery_readin.log", replace
cd "$build/raw/CENCOLData"

*Read in the nursing list; keep eligible applicants that have a student id
	insheet using NursingLotteries.Data.new_MG.csv, names clear
	* remember the original row order of the file
	gen orderitcamein=_n
	describe

	destring stid, generate(stid_num) ignore("NR")
	* evalstat: many are ineligible, and some rows carry the reason why
	tabulate evalstat
		* keep "Eligible" together with the misspellings of it listed here
		keep if inlist(evalstat,"Eligible","Eigible","Eiigible","Eiligible","Elgible","eligible")
		drop evalstat
	* drop if don't have a STID (coded "NR" or left empty)
		drop if inlist(stid,"NR","")
	*first application date: clean the term text, then encode it as year + term fraction
	*(fall = .7, spring = .1, summer = .3)
		replace appl=upper(appl)
		replace appl=subinstr(appl," ","",.)
		* misspelled term names, fixed in this order
		local wrongterm SPIRING SPRINTG SPING FAL2
		local rightterm SPRING SPRING SPRING FALL2
		forvalues i=1/4 {
			local from : word `i' of `wrongterm'
			local to : word `i' of `rightterm'
			replace appl=subinstr(appl,"`from'","`to'",.)
		}
		* stray accent character
		replace appl=subinstr(appl,"`","",.)
		* mistyped years, fixed in this order
		local wrongyear 20205 21013 213
		local rightyear 2005 2013 2013
		forvalues i=1/3 {
			local from : word `i' of `wrongyear'
			local to : word `i' of `rightyear'
			replace appl=subinstr(appl,"`from'","`to'",.)
		}

		* record the term fraction from the leading season name, then strip the name
		gen firstapp_term=.
		local fractions 0.7 0.1 0.3
		local k=0
		foreach season in FALL SPRING SUMMER {
			local ++k
			local frac : word `k' of `fractions'
			local len=length("`season'")
			replace firstapp_term=`frac' if substr(appl,1,`len')=="`season'"
			replace appl=subinstr(appl,"`season'","",.)
		}
		* what is left of appl is the year
		destring appl, generate(tempyearapp)
		replace firstapp_term=round(tempyearapp)+firstapp_term
		drop appl tempyearapp
			
	*Program applied for
		* row-level indicators for the RN-type program codes and for codes containing "LV"
		gen program_TRN=inlist(program,"TRN","RT","RN","PMRN","PRN")
		gen program_LVN=strpos(program,"LV")>0
		* switch each indicator on for every row of a student / first-application group
		* in which any row has it
		foreach p in TRN LVN {
			bysort stid firstapp_term: egen bloop=total(program_`p')
			replace program_`p'=bloop>0
			drop bloop
		}
			
	*Lottery Status
		replace lottery=upper(lottery)
		* unselected, allowing for the misspelling UNSLECTED
		gen lot_unselected=inlist(lottery,"UNSELECTED","UNSLECTED")
		* selected: entries starting with SELECT, or with SUMMER / SPRING
		gen lot_selected=inlist(substr(lottery,1,6),"SELECT","SUMMER","SPRING")
		* alternate: ALTER anywhere in the entry
		gen lot_alternat=strpos(lottery,"ALTER")!=0

		* letter code (lot_catc) and numeric code (lot_cat): S/1, U/2, A/3;
		* when several dummies are on, the later assignment wins
		gen lot_catc="S" if lot_selected==1
		gen lot_cat=1 if lot_selected==1
		replace lot_catc="U" if lot_unselected==1
		replace lot_cat=2 if lot_unselected==1
		replace lot_catc="A" if lot_alternat==1
		replace lot_cat=3 if lot_alternat==1
		drop lotteryst
	*Other way to get in: the grant updated status
		* row-level flags; z1 / z2 hold copies taken before the student-level aggregation
		gen in_otherway1=substr(upper(grantupdatedstatus),1,8)=="SELECTED"
		gen z1=in_otherway1
		gen in_otherway2=substr(upper(grantupdatedstatus2),1,8)=="SELECTED"
		gen z2=in_otherway2
		* either status field starting with ALT or ADD
		gen alt_otherway=inlist(substr(upper(grantupdatedstatus),1,3),"ALT","ADD") | inlist(substr(upper(grantupdatedstatus2),1,3),"ALT","ADD")
		* a flag that is on in any row of a student is switched on in all of that student's rows
		foreach flag in in_otherway1 in_otherway2 alt_otherway {
			bysort stid: egen glap=total(`flag')
			replace `flag'=glap>0
			drop glap
		}
	*Forwarded (ie multiple applications)

			* start from the upper-cased text and strip everything that is not the number
			gen forward_num=upper(forwarded)
			replace forward_num=subinstr(forward_num," ","",.)
			foreach junk in "YES" "#" "-" "*" {
				replace forward_num=subinstr(forward_num,"`junk'","",.)
			}
			* NO (also typed with a zero, N0) becomes 0
			foreach no in NO N0 {
				replace forward_num=subinstr(upper(forward_num),"`no'","0",.)
			}
			* there's just one of these cases
			drop if forward_num=="I"
			destring forward_num, replace
			
	*Forward-to-term (ie each lottery they apply to)
		gen forward_term=upper(forwardt)
		replace forward_term=subinstr(forward_term," ","",.)

		* record the term fraction from the leading season name, then strip the name
		gen forwardn_term=.
		local fractions 0.7 0.1 0.3
		local k=0
		foreach season in FALL SPRING SUMMER {
			local ++k
			local frac : word `k' of `fractions'
			local len=length("`season'")
			replace forwardn_term=`frac' if substr(forward_term,1,`len')=="`season'"
			replace forward_term=subinstr(forward_term,"`season'","",.)
		}

		* non-empty entry without a recognised season: flag it
		gen fileclosed=(forwardn_term==. & forward_term!="")
		* blank out entries starting with CLOS, A, D or R so that only year text is left
		replace forward_term="" if substr(forward_term,1,4)=="CLOS" | inlist(substr(forward_term,1,1),"A","D","R")
		destring forward_term, generate(tempyearapp) ignore("FILECLOSEDNRPENDINGWITHDRAW")
		replace forwardn_term=tempyearapp+forwardn_term


/*-----------------------------------------------------------------------------------------
Code the lottery outcomes
-------------------------------------------------------------------------------------------*/

*restrict to RN applicants
keep if program_TRN==1

*keep only the variables used below, then remove exact duplicate rows
keep stid firstapp_term forward_num lot_cat forwardn_term fileclosed program_TRN in_otherway* z1 z2 alt_otherway
duplicates drop
sort stid forward_num

*a forward number of 0 is counted as the first lottery
replace forward_num=1 if forward_num==0
*a missing outcome is recoded as unselected (2); lot_cat_imp marks those rows
gen lot_cat_imp=1 if missing(lot_cat)
replace lot_cat=2 if missing(lot_cat)

**************************
	*Lottery-by-lottery information (lotteries 1 to 5).
	*  lot1_term is the first-application term, placed on one tagged row per
	*  student / first-application group.
	*  lotK_term (K>=2) is the forward-to term recorded on the row with forward number K-1.
	*  lotK_partic (K>=2) marks rows with forward number K.
	*  The outcome variables are filled only on rows with forward number K.
	egen appflag=tag(stid firstapp_term)
	forvalues k=1/5 {
		if `k'==1 {
			gen lot1_term=firstapp_term*appflag
		}
		else {
			local prev=`k'-1
			gen lot`k'_term=forwardn_term if forward_num==`prev'
			gen lot`k'_partic=forward_num==`k'
		}
		gen lot`k'_adm=lot_cat==1 if forward_num==`k'
		gen lot`k'_rej=lot_cat==2 if forward_num==`k'
		gen lot`k'_alt=lot_cat==3 if forward_num==`k'
		gen lot`k'_inoth1=in_otherway1 if forward_num==`k'
		gen lot`k'_inoth2=in_otherway2 if forward_num==`k'
		gen lot`k'_imprej=lot_cat_imp==1 if forward_num==`k'
		gen lot`k'_altoth=alt_otherway==1 if forward_num==`k'
	}

	*Group totals: one m-prefixed copy of every lotK_ variable, constant within
	*student / program / first-application term
	foreach vv of varlist lot?_* {
		bysort stid program_TRN firstapp_term: egen m`vv'=total(`vv')
	}
/*------------------------------------------------------
*Cleaning of the group-level (m-prefixed) variables	*/


	*If admitted (or in another way) in lottery W, zero out participation in every later lottery
		forvalues won=1/4 {
			local first=`won'+1
			forvalues lot=`first'/5 {
				replace mlot`lot'_partic=0 if mlot`won'_adm==1 | mlot`won'_inoth1==1 | mlot`won'_inoth2==1
			}
		}
	*Blank out lottery information if didn't participate in that lottery
		forvalues lot=2/5 {
			foreach vv in term adm rej alt inoth1 inoth2 imprej {
				replace mlot`lot'_`vv'=. if mlot`lot'_partic==0
			}
		}

	*impute lottery term if lottery term missing (20XX.7 goes to 20XX+0.4, and 20XX.1 goes to 20XX+0.6).
	*A term of 0 here means no term was recorded for the group (the total of all-missing values is 0).
		forvalues z=2/5 {
			tab mlot`z'_term
			local y=`z'-1
			* fractional part of the previous lottery's term, snapped to .1 or .7
			gen dropme=mlot`y'_term-floor(mlot`y'_term)
			tab dropme
			replace dropme=0.1 if dropme>0 & dropme<0.11
			replace dropme=0.7 if dropme>0.6 & dropme<0.71
			* previous term in fall: next one is the following spring
			replace mlot`z'_term=mlot`y'_term+0.4 if mlot`z'_term==0 & dropme>0.6 & dropme<0.8
			* previous term in spring: next one is the fall of the same year
			replace mlot`z'_term=mlot`y'_term+0.6 if mlot`z'_term==0 & dropme>0.05 & dropme<0.15
			drop dropme
			tab mlot`z'_term
		}
			
	
	*Impute missing lottery REJECTIONS for certain first-application terms
		*Step1: dummies for people whose values I impute
		* lottery-1 outcome variables all zero
		local nolot1 "(mlot1_adm+ mlot1_rej+ mlot1_alt+ mlot1_inoth1+mlot1_inoth2+mlot1_imprej)==0"
		gen mlot2_imputemissing=0
		gen mlot4_imputemissing=0
		gen mlot1_imputemissing=0
		gen mlot3_imputemissing=0
		* first application in spring (20XX.1): lottery 2 for 2005-2007, lottery 4 for 2005-2006;
		* flagged when the following lottery was entered but this one has no term
		foreach yr in 2005 2006 2007 {
			replace mlot2_imputemissing=1 if mlot3_partic==1 & mlot1_term>`yr'.0 & mlot1_term<`yr'.2 & mlot2_term==.
		}
		foreach yr in 2005 2006 {
			replace mlot4_imputemissing=1 if mlot5_partic==1 & mlot1_term>`yr'.0 & mlot1_term<`yr'.2 & mlot4_term==.
		}
		* first application in fall (20XX.7): lottery 1 for 2005-2008, lottery 3 for 2005-2006
		foreach yr in 2005 2006 2007 2008 {
			replace mlot1_imputemissing=1 if mlot2_partic==1 & mlot1_term>`yr'.6 & mlot1_term<`yr'.8 & `nolot1'
		}
		foreach yr in 2005 2006 {
			replace mlot3_imputemissing=1 if mlot4_partic==1 & mlot1_term>`yr'.6 & mlot1_term<`yr'.8 & mlot3_term==.
		}

		*Step 2: Impute that they participated and were rejected.
		* (lottery 1 has no participation variable, so only lotteries 2-4 get one set)
		forvalues lot=1/4 {
			if `lot'>1 {
				replace mlot`lot'_partic=1 if mlot`lot'_imputemissing==1
			}
			replace mlot`lot'_rej=1 if mlot`lot'_imputemissing==1
			foreach vv in adm alt inoth1 inoth2 imprej {
				replace mlot`lot'_`vv'=0 if mlot`lot'_imputemissing==1
			}
		}
			

		*Create some mutually exclusive categories
		*When several outcome variables are on, the highest-numbered category wins.
		*NOTE(review): category 5 is also assigned from mlotN_altoth, which is not among the
		*variables blanked for non-participants or reset for imputed rejections earlier in
		*this file - confirm that this is intended.
			label define mlot  1 "Admit" 2 "Reject" 3 "Alt" 4 "InOth1"	5 "InOth2"
			forvalues lot=1/5 {
				gen mlot`lot'_cat=1 if mlot`lot'_adm==1
				replace mlot`lot'_cat=2 if mlot`lot'_rej==1
				replace mlot`lot'_cat=3 if mlot`lot'_alt==1
				replace mlot`lot'_cat=4 if mlot`lot'_inoth1==1
				replace mlot`lot'_cat=5 if mlot`lot'_inoth2==1 | mlot`lot'_altoth==1
				label values mlot`lot'_cat mlot
			}
				

*Reduce to the student id, first-application term and group-level lottery
*variables, and remove exact duplicate rows
keep stid firstapp_term  mlot*
duplicates drop

*participated in multiple rounds of lotteries?
*  numlotsessions  = number of rows the student has
*  firstlotsession = earliest first-application term of the student
*  lotsessnum      = rank of this row's first-application term within the student
sort stid
by stid: gen numlotsessions=_N
egen firstlotsession=min(firstapp_term), by(stid)
egen lotsessnum=rank(firstapp_term), by(stid)


	***SAVE THIS
	save $analy/temp/lottery_nurseCENCOL_flat_new, replace

		


















/*
*move the date variable over to the following observation, provided it's for the same person
gen ff=forward_term[_n-1] if stid==stid[_n-1] & firsta==firsta[_n-1]
	rename forward_term unforwarded_term
	rename ff forward_term
		*the first application (applp_num==1) gets the first term application number
		*first, destring it
		gen forwardn_term=0.7 if substr(forward_t,1,4)=="FALL"
			replace forward_term=subinstr(forward_term,"FALL","",.)
		replace forwardn_term=0.1 if substr(forward_term,1,6)=="SPRING"
			replace forward_term=subinstr(forward_term,"SPRING","",.)
		replace forwardn_term=0.3 if substr(forward_term,1,6)=="SUMMER"
			replace forward_term=subinstr(forward_term,"SUMMER","",.)
		
			*97% of cases have a viable term number, but blank out the ones that don't
				replace forward_term="" if substr(forward_term,1,4)=="CLOS"
				replace forward_term="" if substr(forward_term,1,1)=="A"
				replace forward_term="" if substr(forward_term,1,1)=="D"
				replace forward_term="" if substr(forward_term,1,1)=="R"
		destring forward_term, gen (tempyearapp) ignore("FILECLOSEDNR")
			replace forwardn_term=tempyearapp+forwardn_term
			drop forward_term tempyear

	*the first occurrence of every stid-"first-app" combo gets the first-app date.
		bysort stid firstap: egen rank=rank(order)
			replace forwardn_term=firstap if rank==1
		
	*figure out the REAL first application of this person
		bysort stid: egen realfirstapp_term=min(forwardn_term)
	
	*A small group of people have missing application date (Lots also have missing outcome info, too)
		gen missing_term=forwardn_term==.
		tab missing_term
		tab lot_cat if missing_term, m
		gen fixed=.

		*Unfortunately, some manual recodes due to typos in the data.
			*Recode ones where the previous record is "CLOSEFILENR", then drop if record is "CLOSEFILENR"
			sort stid order
			replace unfo="CLOSEFILENR" if unfo=="CLOSESR"
			replace unfo="CLOSEFILENR" if unfo=="ACCEPT"
			replace unfo="CLOSEFILENR" if unfo=="ADMINCLOSE"
			replace unfo="CLOSEFILENR" if unfo=="CLOSED"
			replace unfo="CLOSEFILENR" if unfo=="CLOSENR"
			replace unfo="CLOSEFILENR" if unfo=="CLOSEFILE"
			replace unfo="CLOSEFILENR" if unfo=="CLOSEFILE-SR"
			replace unfo="CLOSEFILENR" if unfo=="CLOSEFILESR"
			replace unfo="CLOSEFILENR" if unfo=="FILECLOSEDNR"
			

			*list if (missing==1 & unfo[_n-1]=="CLOSEFILENR")|(missing[_n+1]==1 & unfo=="CLOSEFILENR")
				replace forwardn_term=forwardn_term[_n-1] if forwardn_term==. & missing==1 & unfo[_n-1]=="CLOSEFILENR"
				replace lot_catc=lot_catc[_n-1] if lot_catc=="" & missing==1 & unfo[_n-1]=="CLOSEFILENR"
				foreach v of varlist lot_u lot_se lot_al lot_cat{
					replace `v'=`v'[_n-1] if `v'==. & missing==1 & unfo[_n-1]=="CLOSEFILENR"
					}
				replace fixed=1 if missing==1 & forwardn_term!=.
				replace missing=0 if forwardn_term!=.
				drop if fixed[_n+1]==1 & unfo=="CLOSEFILENR"
			*there are also a few where the previous entry is SELECTED and the following entry is problematic. drop those.
				drop if missing==1 & lot_catc[_n-1]=="S"
	
		
			do "$dir/build/prog/CENCOLlottery/CENCOL_manualrecodes.do"
**************************************************************************************************			
		*codes: 2=wouldve been alternate in the missing year
			*  136409 , 299004, 475551		
		*codes: wouldve been selected in the missing year
			/*37445, 462150*/	
****************************************************************************************************/
************************************************************		
	*Flatten it out, with one variable for each lottery they enter (date and outcome)
	*The live part of this do-file ends here; the output file has already been saved.
	*Stop cleanly: close the log opened at the top of the file and leave the do-file,
	*rather than halting on an unrecognised command (which ends every run with an
	*error, leaves the log open and aborts any do-file that calls this one).
		capture log close
		exit
		*NOTE(review): retired code. The do-file stops at the statement just before this
		*section, so none of the lines below are executed as the file stands. They also use
		*variables (e.g. lastname, firstname, realfirstapp, in_otherway) that are not in the
		*data kept by the live code earlier in the file - presumably they relied on the
		*commented-out section that precedes this one; confirm before reviving.
		*Replace lottery outcome as "Unselected" if lottery outcome is missing (this may seem wrong, but a comprehensive look at the raw data suggests typos. Especially in 2013.1 and 2011.1).
		gen lot_cat_imputed=lot_cat==.
		replace lot_cat=2 if lot_cat==.
		
		
		*drop if year and outcome is the same
		duplicates drop stid lastname firstname forwardn_term lot_cat, force
		
		
		*Lottery-by-lottery outcomes
			*Important detail is that if students fail to forward their application to the next term, they must reapply from scratch
			*If they reapply from scratch, then they start from 0 into the "5th lottery guarantee".
		*row is the student's first application ever / the first application of its series
		gen lotapp_firstever=forwardn_term==realfirstapp
		gen lotapp_firstinser=forwardn_term==firstapp
		*how many times did they start over?
		bysort stid: egen numstart=total(lotapp_firstinser)
		*The forward number is correct. The problem is that we are missing some rows of data in years prior to 2009.
			replace forward_num=1 if forward_num==0
			bysort stid: egen numlott=max(forward_num)
		
		
		
		
		
		gen lotterm_1=forwardn_term if lotapp_firstever
		
		*Ever make it in?
		*For each flag below, the earliest term and lowest forward number are computed over
		*the flagged rows only; unflagged rows are left missing until the fill-in loop further down.
		gen everin_flag=1 if lot_cat==1
		bysort stid: egen everin_term=min(forwardn_term) if everin_flag==1
		bysort stid: egen everin_lot=min(forward_num) if everin_flag==1
	
		*ever an alternate (lot_cat 3)
		gen everalt_flag=1 if lot_cat==3
		bysort stid: egen everalt_term=min(forwardn_term) if everalt_flag==1
		bysort stid: egen everalt_lot=min(forward_num) if everalt_flag==1
		
		*got in another way
		gen otherin_flag=in_otherway==1
		bysort stid: egen otherin_term=min(forwardn_term) if otherin_flag==1
		bysort stid: egen otherin_lot=min(forward_num) if otherin_flag==1

		*in through the lottery or another way
		gen inceverin_flag=1 if everin_flag==1 | otherin_flag==1
			bysort stid: egen inceverin_term=min(forwardn_term) if inceverin_flag==1
			bysort stid: egen inceverin_lot=min(forward_num) if inceverin_flag==1
		
		*in through the lottery, another way, or as an alternate
		gen altinceverin_flag=1 if everin_flag==1 | otherin_flag==1 | everalt_flag==1
			bysort stid: egen altinceverin_term=min(forwardn_term) if altinceverin_flag==1
			bysort stid: egen altinceverin_lot=min(forward_num) if altinceverin_flag==1
				
		
				*copy each value to all rows of the student by replacing it with the
				*within-student mean of the non-missing values
				foreach v of varlist everin* everalt* otherin* inceverin* altincever*{
				bysort stid: egen trash=mean(`v')
				replace `v'=trash
				drop trash
				}

				*turn the flags into 0/1 indicators
				*NOTE(review): altinceverin_flag is not converted here - confirm whether that is intended
				replace everin_flag=everin_flag==1
				replace everalt_flag=everalt_flag==1
				replace otherin_flag=otherin_flag==1
				replace inceverin_flag=inceverin_flag==1
		

		
		
	*Flags for what lottery you got in on.
		*cumulative: lotwin_12 is on if in by lottery 1 or 2, lotwin_123 by lottery 1-3, and so on
		gen lotwin_1=everin_flag==1 & everin_lot==1
		gen lotwin_12=(everin_flag==1 & everin_lot==2) | lotwin_1==1
		gen lotwin_123=(everin_flag==1 & everin_lot==3) | lotwin_12==1
		gen lotwin_1234=(everin_flag==1 & everin_lot==4) | lotwin_123==1


		*same, based on the lottery-or-other-way flag
		gen lotwin_inc_1=inceverin_flag==1 & inceverin_lot==1
		gen lotwin_inc_12=(inceverin_flag==1 & inceverin_lot==2) | lotwin_inc_1==1
		gen lotwin_inc_123=(inceverin_flag==1 & inceverin_lot==3) | lotwin_inc_12==1
		gen lotwin_inc_1234=(inceverin_flag==1 & inceverin_lot==4) | lotwin_inc_123==1
		
		*same, based on the flag that also counts alternates
		gen lotwin_altinc_1=altinceverin_flag==1 & altinceverin_lot==1
		gen lotwin_altinc_12=(altinceverin_flag==1 & altinceverin_lot==2) | lotwin_altinc_1==1
		gen lotwin_altinc_123=(altinceverin_flag==1 & altinceverin_lot==3) | lotwin_altinc_12==1
		gen lotwin_altinc_1234=(altinceverin_flag==1 & altinceverin_lot==4) | lotwin_altinc_123==1		


		
	*make the names all uppercase
		replace lastn=upper(lastn)
		replace firstn=upper(firstn)
	*drop the nonflat variables
		drop program grant* program_a order  lot_* forwardn rank forward_num forwardtoterm unforwarded firstapp fixed missing 
		duplicates drop
			/*still a few duplicates in terms of first and last name*/
				*save their alternate names (maiden names, likely)
				*multiname>0 marks students with more than one remaining row; the name on the
				*student's second row is stored on the first row, then only the first row is kept
				duplicates tag stid, gen(multiname)
				bysort stid: gen order=_n
				gen altname_first=""
					replace altname_first=firstname[_n+1] if order==1 & multiname>0
				gen altname_last=""
					replace altname_last=lastname[_n+1] if order==1 & multiname>0
				drop if order>1	
		
	*****************************		
	*should be no duplicates now
	****************************
		duplicates report stid
	***SAVE THIS SUCKER
	save $analy/temp/lottery_nurseCENCOL_flat, replace
