***********************************
************************************
***MEASURES OF ASSOCIATION
************************************
************************************

************************************
***CLEAR MEMORY
************************************
clear all

***Close any log left open by an earlier, interrupted run;
***otherwise "log using" below stops with "log file already open"
capture log close

************************************
***WINDOWS / MACINTOSH
************************************
***Only one log can be opened under a given name, so the Windows and
***Macintosh settings cannot both run. Pick the set that matches the
***operating system Stata reports in c(os).
if c(os) == "Windows" {
    ***Start saving results window
    log using "C:\course\programs\Stata04.log", replace text

    ***Shortcut for folders
    global data   = "C:\course\data"
    global output = "C:\course\output"
}
else {
    ***Start saving results window
    log using "/course/programs/Stata04.log", replace text

    ***Shortcut for folders
    global data   = "/course/data"
    global output = "/course/output"
}

************************************
***OPENING COMMANDS
************************************
***Tell Stata to not pause for "more" messages,
***so long output scrolls without waiting for a key press
set more off

***Change the working directory to the folder stored in the global macro "data"
***(file names without a path below are resolved against this folder)
cd "$data"

***Open 2018 ACS (only Texas); "clear" discards any data already in memory
use "ACS2018TX.dta", clear

************************************
***GENERATE VARIABLES
************************************
***Sex: 0/1 indicator built from sex (1 = male, 2 = female);
***any other value of sex is left missing
gen female = cond(sex==1, 0, cond(sex==2, 1, .))

label define female 0 "Male" 1 "Female"
label values female female

***Race/ethnicity
***Six mutually exclusive groups: Hispanic origin (hispan 1-4) takes
***precedence; every other group requires hispan==0.
***Observations matching none of the conditions stay missing.
gen raceth=.
  replace raceth=1 if race==1 & hispan==0 // White
  replace raceth=2 if race==2 & hispan==0 // Black
  replace raceth=3 if hispan>=1 & hispan<=4 // Hispanic
  replace raceth=4 if (race==4 | race==5 | race==6) & hispan==0 // Asian
  replace raceth=5 if race==3 & hispan==0 // Native American
  replace raceth=6 if (race==7 | race==8 | race==9) & hispan==0 // Other

***Value labels (label for category 6 corrected from "Ohter races")
label define raceth 1 "White" 2 "African American" 3 "Hispanic" ///
                    4 "Asian" 5 "Native American" 6 "Other races"
label values raceth raceth

***Age
***agegr takes the lower bound of each interval listed in at():
***0, 16, 20, 25, 35, 45, 55, 65
***NOTE(review): cut() intervals are closed on the left and open on the right,
***so an age of exactly 100 would be left missing - confirm the data's top code
egen agegr = cut(age), at(0,16,20,25,35,45,55,65,100)

***The value label is named "agecode"; attach that label
***(attaching the undefined name "agegr" leaves the groups unlabelled)
label define agecode 0 "0-15" 16 "16-19" 20 "20-24" 25 "25-34" ///
                     35 "35-44" 45 "45-54" 55 "55-64" 65 "65-100"
label values agegr agecode

***Educational attainment collapsed into five ordered groups;
***values of educ not listed below are left missing
gen educgr = 1 if inrange(educ, 0, 5)        // Less than high school
replace educgr = 2 if educ == 6              // High school
replace educgr = 3 if inlist(educ, 7, 8)     // Some college
replace educgr = 4 if educ == 10             // College
replace educgr = 5 if educ == 11             // 5+ years of college, graduate school

label define educgr 1 "Less than high school" 2 "High school" ///
                    3 "Some college" 4 "College" 5 "Graduate school"
label values educgr educgr

***Marital status collapsed into three groups;
***values of marst outside 1-6 are left missing
gen marital = 1 if inlist(marst, 1, 2)       // Married
replace marital = 2 if inrange(marst, 3, 5)  // Separated, divorced, widowed
replace marital = 3 if marst == 6            // Never married, single

label define marital 1 "Married" 2 "Separated, divorced, widowed" 3 "Never married"
label values marital marital

***Wage and salary income: copy of incwage,
***with the code 999999 turned into missing
gen income = incwage if incwage != 999999

***Migration status in three groups, built from migrate1d;
***codes not listed below are left missing
gen migrant = 1 if inlist(migrate1d, 10, 23)       // same house or within PUMA
replace migrant = 2 if inrange(migrate1d, 24, 32)  // internal migrant
replace migrant = 3 if migrate1d == 40             // international migrant

label define migrant 1 "Non-migrant" 2 "Internal migrant" 3 "International migrant"
label values migrant migrant

***Internal migration status (domestic migration)
***0 = non-migrant, 1 = internal migrant;
***international migrants and missing migrant stay missing
gen dommig = migrant - 1 if inlist(migrant, 1, 2)

label define dommig 0 "Non-migrant" 1 "Internal migrant"
label values dommig dommig

***Check the recode against the source variable, showing missing values
tabulate migrant dommig, missing

***International migration status
***0 = non-migrant, 1 = international migrant;
***internal migrants and missing migrant stay missing
gen intmig = (migrant == 3) if inlist(migrant, 1, 3)

label define intmig 0 "Non-migrant" 1 "International migrant"
label values intmig intmig

***Check the recode against the source variable, showing missing values
tabulate migrant intmig, missing

************************************
***COMPLEX SAMPLE DESIGN
************************************
***Declare the survey design used by every later "svy" command:
***"cluster" is the sampling unit, "perwt" the sampling (probability) weight,
***and "strata" the stratum identifier
svyset cluster [pweight=perwt], strata(strata)

************************************
***CHI SQUARE
************************************
***It would be incorrect to use fweight,
***because you would get statistical significance
***by indicating to the test that you have
***more observations than what was actually collected

***Weights that preserve sample size (aweight, pweight, svy)
***are not allowed in Stata to estimate chi square

***Thus, estimate chi square without weights

************************************
***Migration status by sex, by race/ethnicity, and by education
************************************
***The same sequence of tables is produced for each column variable,
***in this order: sex, raceth, educgr
foreach colvar of varlist sex raceth educgr {

    ***Observed frequencies (fo)
    tabulate migrant `colvar'

    ***Expected frequencies (fe)
    tabulate migrant `colvar', expected nofreq

    ***Use chi square from these tables
    tabulate migrant `colvar', chi2 column
    tabulate migrant `colvar', chi2 column nofreq

    ***Use column percentages from this table
    tabulate migrant `colvar' [fweight=perwt] // population size
    tabulate migrant `colvar' [fweight=perwt], column nofreq // percentage
    tabulate migrant `colvar', missing // missing cases
}

************************************
***SPEARMAN'S RHO (rank correlation coefficient)
************************************
***Educational attainment by age group (both variables are ordinal)
tabulate educgr agegr, column
spearman educgr agegr

***Use column percentages from this table
tabulate educgr agegr [fweight=perwt] // population size
tabulate educgr agegr [fweight=perwt], column nofreq // percentage
tabulate educgr agegr [fweight=perwt], missing // missing cases

************************************
***SCATTERPLOT - INCOME BY AGE
************************************
***Scatterplot without regression line (all observations, including zero income)
twoway scatter income age

***Scatterplot with regression line
***The two commands below draw the same graph with the two twoway syntaxes.
***In the "||" form an "if" belongs to a single plot, so it is repeated for
***scatter and lfit; in the "()" form one "if" applies to both plots.
twoway scatter income age if income!=0 || lfit income age if income!=0, ///
	   ytitle(Wage and salary income) xtitle(Age)

twoway (scatter income age) (lfit income age) if income!=0, ///
	   ytitle(Wage and salary income) xtitle(Age)

***Save graph (the graph drawn last)
***A forward slash is accepted as folder separator on Windows and Macintosh,
***so a single export serves both platforms
graph export "$output/age-income_scatter.png", replace

***Regression coefficients
***Least-squares regression model
***They can be reported in the footnote of the scatterplot
***NOTE(review): this model uses the survey design (weights), while the lfit
***line above is unweighted - confirm that the difference is intended
***Income <- Age
svy, subpop(if income!=. & income!=0): reg income age

************************************
***LINE GRAPH - MEAN INCOME BY AGE
************************************
***Generate variable with mean income by age
***(computed over non-zero incomes; observations with zero income get missing)
***NOTE(review): egen mean() is unweighted - confirm whether a mean
***weighted by perwt was intended
bysort age: egen mincage=mean(income) if income!=0
sum mincage, d

***Line graph of income by age
***(data are already sorted by age by the bysort above)
twoway line mincage age [fweight=perwt], ///
  ytitle("Mean wage and salary income") ylabel(0(20000)80000)

***Save graph
***A forward slash is accepted as folder separator on Windows and Macintosh,
***so a single export serves both platforms
graph export "$output/age-income_line.png", replace

***Regression coefficients
***Least-squares regression model
***They can be reported in the footnote of the line graph

***Generate age squared
gen agesq=age * age

***Income <- Age + Age squared
svy, subpop(if income!=. & income!=0): reg income age agesq

************************************
***TABLE - MEAN INCOME BY AGE GROUP
************************************
***Mean, standard deviation and number of cases of non-zero income per age group
***aweight is used so that n reports the sample size by age group
***NOTE(review): contents() is the syntax of the table command up to Stata 16;
***confirm the Stata version used in the course
table agegr [aweight=perwt] if income!=0, contents(mean income sd income n income)

***Regression coefficients
***Reference category: 45-54 (ib45. sets agegr==45 as the base level)
***Income <- Age groups
svy, subpop(if income!=. & income!=0): regress income ib45.agegr

************************************
***PEARSON'S r
************************************
***It would be incorrect to use fweight,
***because you would get statistical significance
***by indicating to the test that you have
***more observations than what was actually collected

***aweight preserves sample size and
***it is allowed in Stata to estimate Pearson's r

***Wage and salary income, age
corr income age if income!=0 [aweight=perwt]
pwcorr income age if income!=0 [aweight=perwt] // same as above
pwcorr income age if income!=0 [aweight=perwt], sig // with significance test

***Coefficient of determination (r-squared)
***r(rho) is the correlation stored by the pwcorr command just above,
***so the result stays correct if the data or the sample restriction change
di r(rho)^2

************************************
***Correlation matrix
************************************
***Note: educational attainment variable is ordinal, not interval-ratio

***Wage and salary income, age, education
pwcorr income age educ if income!=0 [aweight=perwt], sig

***Keep the correlation matrix stored by pwcorr
***(rows/columns follow the variable order: 1 income, 2 age, 3 educ)
matrix rmat = r(C)

***Coefficient of determination (r-squared)
***Income and age
di rmat[2,1]^2

***Coefficient of determination (r-squared)
***Income and education
di rmat[3,1]^2

************************************
***ANALYSIS OF VARIANCE (ANOVA)
************************************
***Use aweight to get sample size by race/ethnicity
table raceth [aweight=perwt] if income!=0, c(mean income sd income n income)

***Total number of cases
***raceth is coded 1-6 or missing (never 0), so cases without a
***race/ethnicity group are excluded by testing for missing
count if raceth!=. & income!=. & income!=0

***One-way ANOVA
***Income (non-zero) compared across race/ethnicity groups
oneway income raceth if income!=0 [aweight=perwt]
anova income raceth if income!=0 [aweight=perwt]

************************************
***CLOSING COMMANDS
************************************
***Save data, including the variables generated above
***The file name has no path, so it is written to the current working
***directory; "replace" overwrites an existing file of the same name
save "Stata04.dta", replace

***Save log: close the log file opened at the top of this do-file
log close