* Take cleaned data and create variables needed for demand/supply/entry/counterfactuals

* Age is reported as missing for US rows in the demand file (cause unknown),
* so pull a product-by-month age lookup out of data/USdata and store it as
* data/USages for re-attachment further down.
use product t age using data/USdata, clear
sort product t age
save data/USages, replace
*

* Main input: the all-ages sample that keeps imputed prices
* (less sampling error, keeps more hospital-months).
use data/demand-first-logits-checks-data-allage-keepimputep, clear
* Alternative input that has trustworthy price data:
*use demand-first-logits-checks-data-allage
rename M MarketSize

* mm = position of month t within a 12-month cycle, coded 1-12
* (multiples of 12 are coded 12 rather than 0)
gen mm = cond(mod(t,12)==0, 12, mod(t,12))

*---------------------------------------------------------------------------
* Mark when each product first shows up in the US and in the EU, so that
* products entering the US during the sample (and the time they spent in
* trials before US entry) can be identified later.
*---------------------------------------------------------------------------
* first observed month of each product within each region (US==1 / US==0)
bysort product US: egen enter = min(t)
gen enterUS = enter if US==1
gen enterEU = enter if US==0
summarize enter enterUS enterEU US
* spread each region's entry month over every row of the product;
* sorting puts non-missing values first, so element [1] holds the value
foreach v of varlist enterUS enterEU {
	bysort product (`v'): replace `v' = `v'[1] if `v'==.
}
*---------------------------------------------------------------------------
* from here on only the US observations are needed
drop if US==0
*---------------------------------------------------------------------------
* age is missing for US rows in this file (cause unknown), so discard it and
* re-attach the values saved to data/USages at the top of this script
drop age
sort product t
* NOTE(review): _merge is dropped below without being inspected, so any
* (product, t) pairs that exist only in data/USages would remain in memory
* as rows with every other variable missing -- confirm both files cover the
* same product-months
merge 1:1 product t using data/USages
* rows that still have no age after the merge are set to 0
replace age=0 if age==.
* for products whose minimum age is 0, shift all of their ages up by one
bysort product: egen temp=min(age)
replace age = age+1 if temp==0
drop temp _merge
*---------------------------------------------------------------------------
* Remove rows coded age==-999 (presumably the marker for "no entry date
* found" -- verify), and optionally discard the first $agedrop months of each
* product's life (e.g. if delta is thought to be understated at age 1).
* Age is then re-based so that it starts after the dropped months.
drop if age==-999 | age<=$agedrop
replace age = age - $agedrop
*
summarize lnd enter enterUS enterEU age
summarize lnd enter enterUS enterEU age if age<36
*---------------------------------------------------------------------------


*---------------------------------------------------------------------------
* Lifetime profits: build observed profits per product, then (further down)
* impute the part that is cut off by the sample window.
*---------------------------------------------------------------------------
* youngest and oldest age at which each product is observed
bysort product: egen minage = min(age)
bysort product: egen maxage = max(age)
* pay = 1 when the first US month is later than t==1, i.e. the product enters
* during the sample; (minage==1) would miss a few not observed every month
gen pay = enterUS>1
* monthly profit; the factor 10 scales up the roughly 10% MRG sample and the
* division expresses the result in millions
gen profit = q_r*p*10/1000000
* running total over months and its maximum (total observed) per product
bysort product (t): gen profit_cumsum = sum(profit)
bysort product: egen profit_lifetime = max(profit_cumsum)
* adjust for missing data: Tpct is the share of months between the first and
* last observed age that actually appear in the data
gen ones = 1
bysort product: egen Tobs = total(ones)
gen Tpct = Tobs/(maxage-minage+1)
foreach v of varlist profit_cumsum profit_lifetime {
	replace `v' = `v' / Tpct
}
* End truncation: age114 = the product's age in month t==114, copied to all
* of its rows (missing when the product is not observed in that month).
gen age114 = age if t==114
bysort product (age114): replace age114 = age114[1] if age114==.
* Beginning truncation: age1 = the product's age in month t==1, same layout.
gen age1 = age if t==1
bysort product (age1): replace age1 = age1[1] if age1==.
* full = 1 (missing otherwise) for products observed in neither boundary
* month; only these are used to estimate the profit-by-age profile
gen full = 1 if age1==. & age114==.
* monthly and cumulative share of lifetime profit for those products
gen pct_lifetime = profit / profit_lifetime if full==1
gen cum_pct_lifetime = profit_cumsum / profit_lifetime if full==1
* Product-level diagnostics: collapse to one row per product (means of the
* listed variables, plus min/max age and the count of months with profit)
* and summarize, overall and separately by truncation status.
preserve
collapse (mean) full enterUS age1 age114 profit profit_lifetime ///
	(min) minage=age (max) maxage=age (count) months=profit, by(product)
summarize full enterUS profit profit_lifetime minage maxage age1 age114
bysort full: summarize months profit, detail
bysort full: summarize months profit if profit_lifetime<50, detail
restore
* Plot the mean cumulative share of lifetime profit against age for fully
* observed products, with a band of +/- 1.96 standard errors of the mean.
preserve
keep if full==1 /*& maxage>20 & maxage<76*/
collapse (mean) avg_pct=cum_pct_lifetime (sd) sd_pct=cum_pct_lifetime ///
	(count) N=cum_pct_lifetime, by(age)
gen u = avg_pct + 1.96*sd_pct/N^.5
gen l = avg_pct - 1.96*sd_pct/N^.5
* only the first 48 months are drawn; the data are restored afterwards
drop if age>48
twoway line avg_pct age, lcolor(navy) lwidth(thick) ///
	|| line u age, lcolor(midblue) lpattern("-") ///
	|| line l age, lcolor(midblue) lpattern("-") ///
	||, xtitle("Age Since US Introduction (Months)") xlabel(0(12)48) ///
	ytitle("Cumulative Percent Lifetime Profits") ylabel(0(.1)1) yscale(range(0 1)) ///
	legend( order( 1 "Mean Over Products" 2 "Standard Errors" ) rows(1) ) ///
	graphregion(color(white))
graph export "output/LifetimeProfits_Ages_US.pdf", replace
restore
* Profit-by-age profile: for each age, the mean cumulative share of lifetime
* profit among fully observed products (full==1), copied to every row of
* that age so truncated products can look it up.
bysort age: egen avg_pct = mean(cum_pct_lifetime) if full==1
gsort age -avg_pct
by age: replace avg_pct=avg_pct[_n-1] if avg_pct==.
* cap at .99, presumably so that the divisor (1-avg_pct) below is never zero
replace avg_pct=.99 if avg_pct==1
* share reached at age 48, copied to all rows; used for products that are
* 48 months or older at the truncation point
gen avg_pct48 = avg_pct if age==48
gsort -avg_pct48
replace avg_pct48=avg_pct48[_n-1] if avg_pct48==.
* impute end truncated
* The scaling is applied on the product's row in the last sample month
* (t==114), because that is the row whose age equals age114 and therefore
* whose avg_pct is the share earned up to the truncation point. The previous
* condition tested age==114, which only matched products that happened to
* be exactly 114 months old, so most end-truncated products were left
* unadjusted.
replace profit_lifetime = profit_lifetime / avg_pct if age114<48 & age114!=. & t==114
replace profit_lifetime = profit_lifetime / avg_pct48 if age114>=48 & age114!=. & t==114
* spread the adjusted (larger) value over all rows of the product
bysort product: egen temp=max(profit_lifetime)
replace profit_lifetime=temp
drop temp
* impute beginning truncated
* Same logic at the start of the sample: the row with t==1 carries the age
* (age1) at which observation begins, and 1-avg_pct is the share of lifetime
* profit still to come at that age. The previous condition tested age==1,
* which only matched products that were 1 month old at t==1.
replace profit_lifetime = profit_lifetime / (1-avg_pct) if age1<48 & age1!=. & t==1
replace profit_lifetime = profit_lifetime / (1-avg_pct48) if age1>=48 & age1!=. & t==1
bysort product: egen temp=max(profit_lifetime)
replace profit_lifetime=temp
drop temp
/*
* alternate approach using regression
su age, detail
gen last=1 if age>=maxage-12 & t<114-12
replace last=0 if last==.
gen yrs=0 if age<=12
replace yrs=6 if age>72
replace yrs=5 if age>60 & age<=72
replace yrs=4 if age>48 & age<=60
replace yrs=3 if age>36 & age<=48
replace yrs=2 if age>24 & age<=36
replace yrs=1 if age>12 & age<=24
su yrs last
sort product -age
by product: gen profit_next = profit[_n-1]
gen age2 =  yrs*yrs
gen age_profit = yrs*profit
gen age2_profit = age2*profit
reg last yrs age2 profit if age>6 & age<102
reg profit_next profit age_profit age2_profit if age>6 & age<102
reg last yrs age2 profit if profit_lifetime<25 & age>6 & age<102
reg profit_next profit age_profit age2_profit if profit_lifetime<25 & age>6 & age<102
drop yrs last profit_next age2 age_profit age2_profit
*/
* Distribution of imputed lifetime profits across products: one row per
* product, then a histogram for all products and one for those above 25.
preserve
collapse (mean) profit_lifetime, by(product)
* options shared by both histograms
local histopts width(25) start(0) fraction graphregion(color(white))
summarize profit_lifetime, detail
histogram profit_lifetime, `histopts' ///
	xtitle("Lifetime Profits ($ Millions)") ///
	ytitle("Fraction of Products")
graph export "output/LifetimeProfits_Products_US.pdf", replace
* restrict to the upper part of the distribution; the data are restored below
keep if profit_lifetime>25
summarize profit_lifetime
histogram profit_lifetime, `histopts' ///
	xtitle("Lifetime Profits ($ Millions)") ///
	ytitle("Fraction of Products (conditional on >$25M)")
graph export "output/LifetimeProfits_Products_gt25M_US.pdf", replace
restore
*---------------------------------------------------------------------------

* keep a copy of the data before any sample restrictions are applied
save data/USdata_full, replace

*---------------------------------------------------------------------------
* Sample restrictions: some product-months lack the variation needed for
* identification.
*---------------------------------------------------------------------------
* products observed only at old ages make the age and product fixed effects
* collinear; the restriction below is currently switched off
*drop if minage>=36
* need enough months per product for the analysis (bootstrap over months)
*gen ones=1
* Tj = number of observations of each product within its young (age<=36)
* and old (age>36) spell
gen old = age>36
bysort product old: egen Tj = total(1)
* drop spells too short to estimate a fixed effect
drop if Tj<$Tj_min
*keep if Tj36>2 // make sure can estimate fixed effect in restricted GMM sample
*replace age=36 if age>=36

/*
preserve

keep if DES==1
keep if USeventual==1
egen prod=group(product)
sort prod
quietly tab prod, gen(jFE)
sort t prod
outsheet prod lnd age t profit_lifetime pay MarketSize p DES finishfactor jFE* using EUdata_DES_US.csv, replace

restore, preserve
*/

*---------------------------------------------------------------------------
* fillin missing months?

*---------------------------------------------------------------------------

*---------------------------------------------------------------------------
* Number the products and write the analysis file.
*---------------------------------------------------------------------------
* consecutive integer id for each product
egen prod = group(product)
*quietly tab prod, gen(jFE)
* output is ordered by month, then product id
sort t prod
local outvars prod lnd age t profit_lifetime pay MarketSize p DES
* NOTE(review): outsheet without the comma option writes tab-separated text
* even though the file is named .csv -- check what the downstream reader
* expects before changing this
outsheet `outvars' using USdata_agg_raw.csv, replace
save data/USdata_agg_raw, replace
*outsheet prod lnd age clinical USlater t profit_lifetime pay MarketSize p DES USeventual jFE* using EUdata_p.csv, replace
*save EUdata_p, replace



