#delimit;
* commands in this file end with a semicolon, and so do star comments;
version 7;
* memory and matrix-size settings requested for this run;
set mem 1000m;
set matsize 800;
* close any log left open by an earlier run that stopped part-way, otherwise the log using command below
  stops with an error because a log is already open -- capture makes this a no-op when no log is open;
capture log close;
log using AEJ-main-program-to-share-Campylobacter.log, replace;

* this program replicates AEJ-main-program-to-share-poisonA.do but
* defines foodborne hospitalization as Campylobacter only;

* include Campylobacter ICD-9-code 008.43 (1979-1998, used up to 2015),
                        ICD-10-code A04.5 (1999 to present);

* tmpSTAT is the folder where this program saves and reloads its working files;
global tmpSTAT "~/nicco-stuff/tmpSTAT";
* within this file the next two folders are only referred to in the notes on how ca-reg-zip3 was built;
global niccocleandata "~/nicco-stuff/la_restaurants/data/cleandata";
global niccohospdata "~/nicco-stuff/la_restaurants/data/hosp-discharge";

* step 1: -------------- read in disease code from our medical expert ------------;

* The original code file also includes journal code from the March 2000 Environmental Health article 
"Food-related illness and death in the United States". We do not use journal code for regression, so ignore it here;

* load the comma-separated list of disease codes, taking variable names from the first row;
insheet using disease-code-for-AEJ-share.csv, comma names;
describe;
summarize;

* pdiag is dis_code without its first character and without the decimal point
  (the point is taken to sit in position 4 whenever one is present);
generate str8 pdiag=substr(dis_code, 2,.);
replace pdiag=substr(pdiag,1,3)+substr(pdiag,5,.) if index(pdiag, ".")>0;

* eyeball check that the codes were converted correctly;
list dis_code pdiag;

* expert_dum=0 rows are exceptions to be ruled out later, so they are kept in the file;
tabulate expert_dum;

* poisonH flags Campylobacter (code 00843 once the point is removed), per suggestions from Ho et al.;
generate poisonH=pdiag=="00843";
tabulate pdiag if poisonH==1;

* keep only what the later merges need and store the list sorted by code;
keep pdiag expert_code expert_dum poisonH;
sort pdiag;
save $tmpSTAT/expert-code, replace;

* note that three-digit, four-digit and five-digit diagnosis codes need separate treatment;

* Split the expert list into three lookup files, one per code length (3, 4 and 5 characters).
  In each file the code and the expert classification are renamed with the length in the name
  (pdiag3 and expert3code, and so on) so they can be merged onto the matching prefix of the
  discharge diagnosis. Variables are referred to by their full post-rename names rather than by
  the abbreviation pdiag, so the block does not depend on variable-name abbreviation;

* three-character codes;
use $tmpSTAT/expert-code, clear;
keep if length(pdiag)==3;
rename pdiag pdiag3;
rename expert_code expert3code;
sort pdiag3;
save $tmpSTAT/expert3code, replace;
list pdiag3;
list pdiag3 if poisonH==1;

* four-character codes;
use $tmpSTAT/expert-code, clear;
keep if length(pdiag)==4;
rename pdiag pdiag4;
rename expert_code expert4code;
sort pdiag4;
save $tmpSTAT/expert4code, replace;
list pdiag4;
list pdiag4 if poisonH==1;

* five-character codes -- the Campylobacter code 00843 flagged by poisonH has this length;
use $tmpSTAT/expert-code, clear;
keep if length(pdiag)==5;
rename pdiag pdiag5;
rename expert_code expert5code;
sort pdiag5;
save $tmpSTAT/expert5code, replace;
list pdiag5;
list pdiag5 if poisonH==1;
     
* step 2 ------read in hospital discharge data (at individual level) and define expert's poison classification A/B/C ----;

* raw discharge data is at individual level, with diagnosis code, county code and zip3;

* stack the yearly discharge files disch95 through disch99 into one data set;
use $tmpSTAT/disch95;
foreach yy in 96 97 98 99 {;
    append using $tmpSTAT/disch`yy';
};

describe;
summarize;

* drop records dated before 1995;
tabulate year;
keep if year>=1995;

* drop year-county cells with 500 or fewer records, tabulating county before and after;
tabulate county;
egen ncase=count(year), by(year county);
keep if ncase>500;
tabulate county;
drop ncase;

* counts of food poisoning incidence;
* merge with expert code;

* Attach the expert classification to each discharge record by matching the first 3, 4 and 5
  characters of the diagnosis code against the lookup file of the same length. The three merges
  run in that order, and the 4- and 5-character merges use update replace, so for variables the
  lookup files share (poisonH, expert_dum) a non-missing value from a longer matching code
  overwrites what an earlier merge brought in;
gen str3 pdiag3=substr(pdiag,1,3);
gen str4 pdiag4=substr(pdiag,1,4);
gen str5 pdiag5=substr(pdiag,1,5);
* pass 1: three-character codes;
sort pdiag3;
merge pdiag3 using $tmpSTAT/expert3code;
tab _merge;
* _merge==2 rows exist only in the lookup file (no discharge record has that code);
drop if _merge==2;
rename _merge _merge3;
* pass 2: four-character codes;
sort pdiag4;
merge pdiag4 using $tmpSTAT/expert4code, update replace;
tab _merge;
drop if _merge==2;
rename _merge _merge4;
* cross-checks on records that matched a four-character code (_merge values of 3 and above are matches);
tab _merge3 _merge4;
tab pdiag expert4code if _merge4>=3;
tab pdiag expert3code if _merge4>=3, missing;
tab expert3code expert4code if _merge4>=3, missing;

* pass 3: five-character codes -- this is the pass that can set poisonH to 1;
sort pdiag5;
merge pdiag5 using $tmpSTAT/expert5code, update replace;
tab _merge;
drop if _merge==2;
rename _merge _merge5;
tab _merge3 _merge5;   
tab _merge4 _merge5;
* inspect the Campylobacter flag and the records that carry it;
sum poisonH;
list pdiag pdiag5 expert5code poisonH if poisonH==1;
* cross-checks on records that matched a five-character code;
tab pdiag expert5code if _merge5>=3;
tab pdiag expert3code if _merge5>=3, missing;
tab pdiag expert4code if _merge5>=3, missing;
tab expert3code	expert5code if _merge5>=3, missing;
tab expert4code expert5code if _merge5>=3, missing;

* Combine the three length-specific classifications into one expert_code per record, build the
  poison indicator variables, convert zip3 to numeric and save the record-level file;

* start from the first non-empty classification, looking at 3-, then 4-, then 5-character matches;
gen str1 expert_code=expert3code;
replace expert_code=expert4code if expert_code=="" & expert4code~="";
replace expert_code=expert5code if expert_code=="" & expert5code~="";

* there could be cases where expert3code, expert4code or expert5code conflict;
* in that case we should take the latter code, that is the one from the longer (more specific) match;
* for example, 005 is A but 0050 0051 0052 0054 0058 and 00581 should be excluded from food;

tab expert_code;
tab expert3code expert4code, missing;
tab expert3code expert5code, missing;
tab expert4code expert5code, missing;
* 4-character code overrides a differing 3-character code when there is no 5-character match;
replace expert_code=expert4code if expert3code~="" & expert4code~="" & expert5code=="" & expert3code~=expert4code;
* 5-character code overrides a differing 3-character code;
replace expert_code=expert5code if expert5code~="" & (expert3code~="" & expert3code~=expert5code);
* 5-character code overrides a differing 4-character code when there is no 3-character match;
replace expert_code=expert5code if expert5code~="" & (expert3code=="" & expert4code~="" & expert4code~=expert5code);
tab expert_code;
 
* records with no classification, or with classification 0, are put in category N;
replace expert_code="N" if expert_code=="" | expert_code=="0";

tab year expert3code;
tab year expert4code;
tab year expert5code;
tab year expert_code;
label variable expert_code "A >90% B 50-90% C 10-50% N <10%";      

tab year poisonH;

* 0/1 indicators for each expert category and their unions -- poisonAH is category A or Campylobacter;
gen poisonA=expert_code=="A";
gen poisonB=expert_code=="B";
gen poisonC=expert_code=="C";
gen poisonAB=poisonA==1 | poisonB==1;
gen poisonABC=poisonA==1 | poisonB==1 | poisonC==1;
gen poisonAH= poisonA==1 | poisonH==1;
tab pdiag if poisonA==1;
tab pdiag if poisonB==1;
tab pdiag if poisonC==1;
tab pdiag if poisonH==1;
tab pdiag if poisonAH==1;
* some poisonH might be missing because of merge update (records that matched none of the lookup files);
sum poison*;
replace poisonH=0 if poisonH==.;
* poisonAH comes from a logical expression above, so this line is only a safeguard;
replace poisonAH=0 if poisonAH==.;

        /* note the county code in discharge data is not fips cty code*/;

* turn zip3 into a number -- real() is applied to it, so it is presumably stored as a string -- then
  tabulate and drop the records whose zip3 does not convert;
rename zip3 zipcode3;
gen zip3=real(zipcode3);
tab year zipcode3 if zip3==.;
drop zipcode3;
drop if zip3==.;
* sorted by the keys used for the merge with the regulation data in step 3;
sort year month zip3;
save $tmpSTAT/disch95to99.dta, replace;

* step 3 ------------- merge individual hospital discharge data with regulation data ------------------;

/* regulation data ca-reg-zip3.dta has been created by 
(1) merging $niccocleandata/city.startdate.dta and $niccohospdata/la-zipcode-locctyid.csv for LA zipcodes
(2) appending with $niccohospdata/ca-zipcode.csv and
(3) collapsing zip5 into zip3 weighted by zip 5 population;

* make distinction of LA and non-LA, 
        For zipcodes completely in non-LA counties, take the unique zipcode. 
        For zipcodes partially in LA county, include it in LA county file and 
                calculate weighted regulation with weights equal to pop in each place-zipcode;

* here is how I define LA regulation
gen sub_reg=0;
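replace sub_reg=1 if fipscty==6037 & (locctyid~=154 | locctyid~=81 |  locctyid~=106);
                /* all cities in LA county except long beach, vernon and pasadena are sub to LA cty reg.
                   NOTE(review): as written the three inequalities are joined by OR, which is true for
                   every locctyid, so no city is actually excluded. The stated intent would need AND
                   between them -- confirm against the program that built ca-reg-zip3 */; 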

* generate card_yes and card_vol by year month and zip3;
* because a zip3 may include several cities who adopt the regulation at 
different time, we calculate the average card_yes and card_vol by year month 
and zip3 weighted by population in each zip5 and city within that zip3;

* here is how I define LActy
gen LActy=0;
replace LActy=1 if fipscty==6037;

* here is how I define Southern CA counties excluding LA
tab fipscty;
* southern CA includes county Imperial(6025), Kern (6029), Orange (6059), San Bernardino (6071), San Diego (6073), 
San Luis Obispo (6079), Santa Barbara (6083), Riverside (6065), Ventura (6111);
gen SouthCA_noLA=fipscty==6025|
		fipscty==6029|
		fipscty==6059|
		fipscty==6071|
		fipscty==6073|
		fipscty==6079|
		fipscty==6083|
		fipscty==6065|
		fipscty==6111;
tab fipscty, sum(SouthCA_noLA);

* the key variables are LActy, per_reg, card_yes and card_vol

label variable LActy "% of pop belong to LA cty";
label variable per_reg "% of pop subject to LA cty reg by zip3";
label variable t_pop "total population in zip3";
label variable card_yes "% of days in year-month-zip3 subject to LA's mandatory grade cards";
label variable card_vol "% of days in year-month-zip3 subject to LA's voluntary grade cards";

*/;


* Step 3: attach the zip3-level regulation variables to every discharge record, build the
  grade-card x poison-group interactions at the record level and save the merged file;

* reload the record-level file saved at the end of step 2 -- clear is the option that lets use
  replace the data in memory (replace is not an option of the use command);
use $tmpSTAT/disch95to99.dta, clear;
sort year month zip3;
* NOTE(review): ca-reg-zip3 is read from the working directory here, while step 4 reads it from the
  tmpSTAT folder -- confirm both refer to the same file;
merge year month zip3 using ca-reg-zip3;
tab _merge;
sort year month zip3;
* list each distinct year-month-zip3 found in the discharge data but not in the regulation data;
list year month zip3 if _merge==1 & (year~=year[_n-1] | month~=month[_n-1] | 
zip3~=zip3[_n-1]);
drop if _merge==1; /* these are the tourists from other areas who got sick in 
California */;
* year-month-zip3 cells with no discharge record are dropped here and added back in step 4;
drop if _merge==2;
drop _merge;

* the hosp data only have admission year and month, to be safe assume everyone is admitted in the first day of the month;

* interactions of the grade-card variables with the category-A indicator;
gen cdyes_hyg=card_yes*poisonA;
gen cdvol_hyg=card_vol*poisonA;

* the same interactions with the Campylobacter indicator;
gen cdyes_hygH=card_yes*poisonH;
gen cdvol_hygH=card_vol*poisonH;

* and with the union of the two;
gen cdyes_hygAH=card_yes*poisonAH;
gen cdvol_hygAH=card_vol*poisonAH;

label variable card_yes "% of mandatory posting grade card in LA cty";
label variable card_vol "% of voluntary posting grade card in LA cty";
label variable cdyes_hyg "=1 if mandatory posting grade card in LA cty & 
related to food poisoning";
label variable cdvol_hyg "=1 if voluntary posting grade card in LA cty & 
related to food poisoning";

save $tmpSTAT/complete-hosp-discharge,replace;

* step 4 -- collapse the merged data into year-month-zip3 and diaggrp (poison A versus non-poisonA);

* generate a complete list of year month zip3 poisonA, poisonH and others;

* Build one row per year-month-zip3 and diagnosis group, so that cells with no hospitalization
  can later be filled in with a zero count. Each regulation row is copied three times and the
  copies are numbered 0, 1, 2 within year-month-zip3;
use $tmpSTAT/ca-reg-zip3, clear;
expand 3;
sort year month zip3;
quietly by year month zip3: generate poisonA=_n-1;
* the three values 0, 1 and 2 should be equally frequent;
tabulate poisonA;

* copy number 2 becomes the poisonH row, and poisonA is reset to 0 on that row;
generate poisonH=(poisonA==2);
replace poisonA=0 if poisonH==1;
* expected: one third of rows with both at zero, one third with only poisonA=1,
  one third with only poisonH=1;
tabulate poisonA poisonH;

generate poisonAH=(poisonA==1 | poisonH==1);

* three-way group id: 0 neither, 1 poisonA, 2 poisonH (poisonH wins if both are set);
generate poisonAH_3grp=cond(poisonH==1, 2, cond(poisonA==1, 1, 0));
tabulate poisonAH_3grp, summarize(poisonA);
tabulate poisonAH_3grp, summarize(poisonH);
summarize poison*;

* stored sorted on the keys of the merge with the collapsed discharge data;
sort year month zip3 poisonAH_3grp;
save $tmpSTAT/ca-reg-zip3-poisonA-poisonH, replace;

* collapse individual hospital discharge data into year-month-zip3;

* Collapse the record-level file to one row per year-month-zip3 and diagnosis group, then merge
  with the full grid so that cells without any hospitalization appear with a zero count;
use $tmpSTAT/complete-hosp-discharge, clear; 

* indicator variables for each category of ethnicity, race and payment source;
tab ethn, gen(ethn_);
tab race, gen(race_);
tab paysrc, gen(pay_);
* female is 1 when sex==2 and 0 otherwise, including when sex is missing;
gen female=1 if sex==2;
replace female=0 if female==.;
* age in years from the year and day components -- missing if either component is missing;
gen age=agey+aged/365;

* same three-way group id as in the grid file: 0 neither, 1 poisonA, 2 poisonH.
  The poisonH line runs last, so a record with both flags set goes to group 2.
  NOTE(review): if the Campylobacter code is also classified A by the expert, group 2 rows built
  from actual records keep poisonA=1 after the collapse -- confirm how that interacts with the
  poisonA~=1 filter used in the regressions;
gen poisonAH_3grp=0;
replace poisonAH_3grp=1 if poisonA==1;
replace poisonAH_3grp=2 if poisonH==1;
tab poisonAH_3grp, sum(poisonA);
tab poisonAH_3grp, sum(poisonH);

* n_diaggrp counts records with non-missing age, sumlos and sumchrg are totals, everything else is a cell mean.
  NOTE(review): records with missing age are not counted in n_diaggrp -- confirm there are none;
collapse (count) n_diaggrp=age (sum) sumlos=los sumchrg=tcharge (mean) avglos=los avgchrg=tcharge ethn_* race_* pay_* female 
age card_* LActy per_reg t_pop poisonA poisonH poisonAH, by(year month zip3 poisonAH_3grp);
sort year month zip3;

* merge with ca-reg-zip3 to account for those zip3s that do not have any 
hospitalization episode at a year+month+poisonA combination;

sort year month zip3 poisonAH_3grp;
merge year month zip3 poisonAH_3grp using $tmpSTAT/ca-reg-zip3-poisonA-poisonH;          
tab _merge;
* only the count is set to zero for grid-only cells -- the sums and means stay missing there;
replace n_diaggrp=0 if _merge==2; /* if a zip3 does not occur in discharge data, means zero incidence*/;
drop _merge;
des;
sum;

* Build the regressors, the variable-list macros and the alternative outcome measures on the
  collapsed year-month-zip3-group data;

* year and calendar-month indicator variables;
tab year, gen(yr_);
tab month, gen(month_);

* grade-card x group interactions, rebuilt here because the collapse kept only card_yes and card_vol;
gen cdyes_hyg=card_yes*poisonA;
gen cdvol_hyg=card_vol*poisonA;

gen cdyes_hygH=card_yes*poisonH;
gen cdvol_hygH=card_vol*poisonH;

gen cdyes_hygAH=card_yes*poisonAH;
gen cdvol_hygAH=card_vol*poisonAH;

* demographic controls (cell means) -- defined here but not used by the regressions in this section;
global ind_demo
ethn_* race_* age pay_* female;

* time controls;
global yymm
yr_* month_* /*diag_**/;

* earlier definition of the card macro without the poisonH terms, kept for reference;
*global card
poisonA card_* cdyes_hyg cdvol_hyg per_reg t_pop LActy;

* regulation regressors used in the J&L specifications below;
global card
poisonA poisonH card_* cdyes_hyg cdvol_hyg cdyes_hygH cdvol_hygH per_reg t_pop LActy;

* panel id: zip3 times 1000 plus a group code (1 for poisonA, 2 for poisonH) -- used as the fixed effect
  and the cluster variable in the regressions;
gen zip3diag=zip3*1000+(poisonA*1+poisonH*2);      
iis zip3diag;

* per-capita measures, dividing by the zip3 population t_pop;
gen ndiag_p=n_diaggrp/t_pop;
gen los_p=sumlos/t_pop;
gen chrg_p=sumchrg/t_pop;

* logs of the counts, totals and averages -- these are missing wherever the argument is zero or missing;
gen ln_npois=ln(n_diaggrp);
gen lnpois_p=ln(ndiag_p);
gen lnsumlos=ln(sumlos);
gen lnlos_p=ln(los_p);
gen lnchrg=ln(sumchrg);
gen lnchrg_p=ln(chrg_p);
gen lnavglos=ln(avglos);
gen lnavgchg=ln(avgchrg);

* regressions: For each dep var: 
                OLS with $card
                FE with $card;

* dependent variable: log of one plus the cell count, so that cells with n_diaggrp=0 are kept;
generate lnn_diaggrp=ln(n_diaggrp+1);

* descriptive tables by group and year: cell counts first, then means of the dependent variable
  for LA (LActy>0), non-LA, southern CA without LA, and the rest of CA;
tabulate poisonA year;
tabulate poisonH year;
tabulate poisonA year if LActy>0, summarize(lnn_diaggrp);
tabulate poisonH year if LActy>0, summarize(lnn_diaggrp);
tabulate poisonA year if LActy==0, summarize(lnn_diaggrp);
tabulate poisonH year if LActy==0, summarize(lnn_diaggrp);
tabulate poisonA year if SouthCA_noLA>0, summarize(lnn_diaggrp);
tabulate poisonH year if SouthCA_noLA>0, summarize(lnn_diaggrp);
tabulate poisonA year if LActy==0 & SouthCA_noLA==0, summarize(lnn_diaggrp);
tabulate poisonH year if LActy==0 & SouthCA_noLA==0, summarize(lnn_diaggrp);

* interaction suggested by Ho et al: group indicator times a 1998-and-later indicator;
generate foodaft1998=poisonA*(year>=1998);
generate foodaft1998H=poisonH*(year>=1998);

* now ready for regression, always cluster by zip3diag;
* the regressions that follow define poisonH=foodborne hospitalization,
  results are reported in Table 5 columns 7-8, see the comments next to each regression;
* pretreatment test and year-by-year effects, using different geographic control groups;

* LA share times the category-A indicator, and its interaction with each year 1995 to 1999;
generate treatedLA=LActy*poisonA;
forvalues yy = 95/99 {;
    generate treatedLA_`yy'=treatedLA*(year==19`yy');
};

* the same set for the Campylobacter indicator;
generate treatedLA_H=LActy*poisonH;
forvalues yy = 95/99 {;
    generate treatedLA_H`yy'=treatedLA_H*(year==19`yy');
};

* note that LActy can have decimal points;
* test using LActy>0;

* include both food and nonfood observations;

* all CA up to 97 vs all years separately;
*-----------------------------------------;
* now on poisonH only;
*~~~~~~~~~~~~~~~~~~~~~;

* Control group: all of California. Every regression below absorbs zip3diag fixed effects,
  clusters standard errors on zip3diag and leaves out rows with poisonA equal to 1.
  NOTE(review): treatedLA_96 to treatedLA_99 are LActy times poisonA, so with poisonA~=1 they look
  like they are all zero in the estimation sample -- confirm they are simply dropped;

* pre-treatment check on years up to 1997, followed by joint tests of the pre-period interactions;
areg lnn_diaggrp $yymm /*treatedLA_95*/ treatedLA_96 treatedLA_97 treatedLA_H96 treatedLA_H97
if year<=1997 & poisonA~=1, a(zip3diag) cluster(zip3diag);
test treatedLA_96 treatedLA_97;
test treatedLA_H96 treatedLA_H97;

* Table 5 Column 7, panel A -- year-by-year interactions with 1995 as the omitted year, then joint
  tests of the 1996-97 and the 1998-99 interactions;

areg lnn_diaggrp $yymm /*treatedLA_95*/ treatedLA_96 treatedLA_97 treatedLA_98 treatedLA_99
treatedLA_H96 treatedLA_H97 treatedLA_H98 treatedLA_H99 if poisonA~=1, a(zip3diag) cluster(zip3diag);
test treatedLA_96 treatedLA_97;
test treatedLA_98 treatedLA_99;
test treatedLA_H96 treatedLA_H97;
test treatedLA_H98 treatedLA_H99;

* J&L specification as in 2003 paper -- Table 5, column 7, panel B;
areg lnn_diaggrp $card $yymm if poisonA~=1, a(zip3diag) cluster(zip3diag) robust;

* add the interaction suggested by Ho et al -- Table 5, column 7, panel C;
areg lnn_diaggrp $card $yymm foodaft1998 foodaft1998H if poisonA~=1, a(zip3diag) cluster(zip3diag) robust;


* LA and SouthCA_noLA up to 97 vs. all years separately;
*---------------------------------------------------------;

* now on poisonH only;
*~~~~~~~~~~~~~~~~~~~~~;
* Control group: southern California without LA. The sample is restricted to rows with LActy>0
  or SouthCA_noLA>0, and otherwise the four regressions repeat the all-California set
  (zip3diag fixed effects, clustering on zip3diag, poisonA rows left out);

* pre-treatment check on years up to 1997;
areg lnn_diaggrp $yymm /*treatedLA_95*/ treatedLA_96 treatedLA_97 treatedLA_H96 treatedLA_H97 
if year<=1997 & poisonA~=1 & (LActy>0|SouthCA_noLA>0), a(zip3diag) cluster(zip3diag);
test treatedLA_96 treatedLA_97;
test treatedLA_H96 treatedLA_H97;

* year-by-year interactions over all years, with joint tests of the 1996-97 and 1998-99 interactions;
areg lnn_diaggrp $yymm /*treatedLA_95*/ treatedLA_96 treatedLA_97 treatedLA_98 treatedLA_99
treatedLA_H96 treatedLA_H97 treatedLA_H98 treatedLA_H99 if poisonA~=1 & (LActy>0|SouthCA_noLA>0), a(zip3diag) cluster(zip3diag);
test treatedLA_96 treatedLA_97;
test treatedLA_98 treatedLA_99;
test treatedLA_H96 treatedLA_H97;
test treatedLA_H98 treatedLA_H99;

* J&L specification as in 2003 paper;
areg lnn_diaggrp $card $yymm if poisonA~=1 & (LActy>0|SouthCA_noLA>0), a(zip3diag) cluster(zip3diag) robust;

* add the interaction suggested by Ho et al;
areg lnn_diaggrp $card $yymm foodaft1998 foodaft1998H if poisonA~=1 & (LActy>0|SouthCA_noLA>0), a(zip3diag) cluster(zip3diag) robust;


* LA and rest of CA excluding SouthCA up to 97 vs. all years separately;
*----------------------------------------------------------------------;

* now on poisonH only;
*~~~~~~~~~~~~~~~~~~~~~;

* Control group: California outside southern California. The sample is restricted to rows with
  LActy>0 or SouthCA_noLA==0, and otherwise the four regressions repeat the all-California set
  (zip3diag fixed effects, clustering on zip3diag, poisonA rows left out).
  NOTE(review): no log close follows the last regression in this section -- confirm the log is
  closed elsewhere or add one at the very end of the program;

* pre-treatment check on years up to 1997;
areg lnn_diaggrp $yymm /*treatedLA_95*/ treatedLA_96 treatedLA_97 treatedLA_H96 treatedLA_H97 
if year<=1997 & poisonA~=1 & (LActy>0|SouthCA_noLA==0), a(zip3diag) cluster(zip3diag);
test treatedLA_96 treatedLA_97;
test treatedLA_H96 treatedLA_H97;

* Table 5 column 8 panel A;
areg lnn_diaggrp $yymm /*treatedLA_95*/ treatedLA_96 treatedLA_97 treatedLA_98 treatedLA_99
treatedLA_H96 treatedLA_H97 treatedLA_H98 treatedLA_H99 if poisonA~=1 & (LActy>0|SouthCA_noLA==0), a(zip3diag) cluster(zip3diag);
test treatedLA_96 treatedLA_97;
test treatedLA_98 treatedLA_99;
test treatedLA_H96 treatedLA_H97;
test treatedLA_H98 treatedLA_H99;

* J&L specification as in 2003 paper -- Table 5 column 8 panel B;
areg lnn_diaggrp $card $yymm if poisonA~=1 & (LActy>0|SouthCA_noLA==0), a(zip3diag) cluster(zip3diag) robust;

* add the interaction suggested by Ho et al -- Table 5 column 8 panel C;
areg lnn_diaggrp $card $yymm foodaft1998 foodaft1998H if poisonA~=1 & (LActy>0|SouthCA_noLA==0), a(zip3diag) cluster(zip3diag) robust;



 
