************************************
************************************
***SOCI 600: INTRODUCTION TO SOCIOLOGICAL DATA ANALYSIS
***BIVARIATE ASSOCIATIONS FOR NOMINAL-, ORDINAL-, AND INTERVAL-RATIO-LEVEL VARIABLES
************************************
************************************

************************************
***CLEAR MEMORY
************************************
clear all

************************************
***CREATE SHORTCUTS AND LOG FILE
************************************
***Shortcut for folders (referenced below as $codes, $data, $output)
global codes  = "H:\course\codes"
global data   = "H:\course\data"
global output = "H:\course\output"

***Start saving results window
*"clear all" does not close log files; if a previous run stopped on an
*error, its log is still open and "log using" would fail.
*"capture" suppresses the error raised when no log is open.
capture log close
log using "$codes\Stata07.log", replace text

************************************
***OPENING COMMANDS
************************************
***Turn off the "more" pause so long output scrolls without stopping
set more off

***Load the 2019 ACS file (only Texas), replacing any data in memory
use "$data\ACS2019.dta", clear

***Declare the complex survey design used by the "svy:" prefix below
svyset cluster [pw=perwt], singleunit(scaled) strata(strata)

************************************
***GENERATE VARIABLES
************************************
***Sex: 0/1 indicator built from sex (1 -> 0 Male, 2 -> 1 Female);
***any other value of sex is left missing
generate female = cond(sex == 1, 0, cond(sex == 2, 1, .))

label define female 0 "Male" 1 "Female"
label values female female

***Race/ethnicity
*Categories are mutually exclusive: Hispanic (hispan 1-4) regardless of race,
*all other groups require hispan==0. Anything else stays missing.
generate raceth = .
  replace raceth = 3 if inrange(hispan, 1, 4)              // Hispanic
  replace raceth = 1 if hispan == 0 & race == 1            // White
  replace raceth = 2 if hispan == 0 & race == 2            // Black
  replace raceth = 5 if hispan == 0 & race == 3            // Native American
  replace raceth = 4 if hispan == 0 & inlist(race, 4, 5, 6) // Asian
  replace raceth = 6 if hispan == 0 & inlist(race, 7, 8, 9) // Other

label define raceth 1 "White" 2 "African American" 3 "Hispanic" ///
                    4 "Asian" 5 "Native American" 6 "Other races"
label values raceth raceth

***Age
*egen cut() codes each person with the lower bound of the age interval.
*Intervals are closed on the left and open on the right, so values at or
*above the last cutpoint (100) are set to missing.
egen agegr = cut(age), at(0,16,20,25,35,45,55,65,100)

*The value label is named "agecode"; it must be attached under that name.
*(Attaching a non-existent label name does not raise an error in Stata,
*but the categories are then displayed as bare numbers.)
label define agecode 0 "0-15" 16 "16-19" 20 "20-24" 25 "25-34" ///
                     35 "35-44" 45 "45-54" 55 "55-64" 65 "65-100"
label values agegr agecode

***Educational attainment
*Collapse educ into five ordered groups; unlisted codes stay missing
generate educgr = .
  replace educgr = 1 if inrange(educ, 0, 5) // Less than high school
  replace educgr = 2 if educ == 6           // High school
  replace educgr = 3 if inlist(educ, 7, 8)  // Some college
  replace educgr = 4 if educ == 10          // College
  replace educgr = 5 if educ == 11          // 5+ years of college, graduate school

label define educgr 1 "Less than high school" 2 "High school" ///
                    3 "Some college" 4 "College" 5 "Graduate school"
label values educgr educgr

***Marital status
*Three groups built from marst; any other value stays missing
generate marital = .
  replace marital = 1 if inlist(marst, 1, 2)  // Married
  replace marital = 2 if inrange(marst, 3, 5) // Separated, divorced, widowed
  replace marital = 3 if marst == 6           // Never married, single

label define marital 1 "Married" 2 "Separated, divorced, widowed" 3 "Never married"
label values marital marital

***Migration status
*Three groups built from migrate1d; any other value stays missing
generate migrant = .
  replace migrant = 1 if inlist(migrate1d, 10, 23)   // same house or within PUMA
  replace migrant = 2 if inrange(migrate1d, 24, 32)  // internal migrant
  replace migrant = 3 if migrate1d == 40             // international migrant

label define migrant 1 "Non-migrant" 2 "Internal migrant" 3 "International migrant"
label values migrant migrant

***Internal migration status (domestic migration)
*0 = non-migrant (migrant==1), 1 = internal migrant (migrant==2);
*international migrants and missing cases stay missing
generate dommig = migrant - 1 if inlist(migrant, 1, 2)

label define dommig 0 "Non-migrant" 1 "Internal migrant"
label values dommig dommig

*Check the recode against the source variable, showing missing cases
tabulate migrant dommig, missing

***International migration status
*0 = non-migrant (migrant==1), 1 = international migrant (migrant==3);
*internal migrants and missing cases stay missing
generate intmig = (migrant == 3) if inlist(migrant, 1, 3)

label define intmig 0 "Non-migrant" 1 "International migrant"
label values intmig intmig

*Check the recode against the source variable, showing missing cases
tabulate migrant intmig, missing

***Wage and salary income
*Copy incwage, leaving the 999999 code as missing
generate income = incwage if incwage != 999999

************************************
***ASSOCIATIONS BETWEEN NOMINAL-LEVEL VARIABLES
************************************

************************************
***PHI - Internal migration by sex
************************************
***Report column percentages that
***take the survey weights into account
tabulate dommig female [fweight=perwt], column nofreq // column percentages
tabulate dommig female [fweight=perwt]                // population size
tabulate dommig female                                // sample size
tabulate dommig female, missing                       // missing cases

***Phi correlation coefficient
*Phi measures the degree of relation
*between two binary variables
*(i.e., dichotomous variables, dummy variables)

*To compute Phi, code both binary variables as 0/1
*and estimate Pearson's r correlation
correlate dommig female   // for two 0/1 variables, Pearson's r equals Phi
pwcorr dommig female      // same coefficient as above
pwcorr dommig female, sig // Phi with test of significance

************************************
***CHI SQUARE, LAMBDA, CRAMER'S V - Migration status by race/ethnicity
************************************
***Report column percentages that
***take the survey weights into account
tabulate migrant raceth [fweight=perwt], column nofreq // column percentages
tabulate migrant raceth [fweight=perwt]                // population size
tabulate migrant raceth                                // sample size
tabulate migrant raceth, missing                       // missing cases

***Chi square
tabulate migrant raceth, chi2  // unweighted test
svy: tabulate migrant raceth   // chi square test with complex survey design (correct form)

***Cramer's V
tabulate migrant raceth, V     // unweighted

***Chi square and Cramer's V in a single table
tabulate migrant raceth, chi2 V // unweighted

***Lambda
*"lambda" is a user-written command. If it is not installed,
*run the following line once (remove the leading asterisk):
*ssc install lambda

*Note: With very unequal row totals, Lambda can be zero
*even when the variables are associated.

*In that situation, prefer a Chi Square based
*measure of association.
lambda migrant raceth [aweight=perwt]

************************************
***ASSOCIATIONS BETWEEN ORDINAL-LEVEL VARIABLES
************************************

************************************
***GAMMA - Education group by age group
************************************
***Remember to report column percentages
***taking into account survey weights
tab educgr agegr [fweight=perwt], col nofreq // column percentages
tab educgr agegr [fweight=perwt] // population size
tab educgr agegr // sample size
tab educgr agegr, m // missing cases

***Gamma measures the strength and pattern/direction of the association
tab educgr agegr, gamma // unweighted

***Test statistic: Z = gamma / ASE
***ASE: Asymptotic Standard Error
*Use the results stored by tabulate (r(gamma), r(ase_gam)) instead of
*retyping rounded numbers from the output: the values stay correct if the
*data or recodes change, and the ASE is not rounded to three decimals.
*The scalar must be created right after the tab command above,
*before another command overwrites r().
scalar gamma_z = r(gamma) / r(ase_gam)
di scalar(gamma_z) // test statistic

***p-value (one-tailed test)
***"normal" function calculates area under the curve below the Z-score
***If Z is positive, the p-value is 1-normal(Z)
***If Z is negative, the p-value is normal(Z)
***normal(-abs(Z)) gives the same answer in both cases
di normal(-abs(scalar(gamma_z))) // p-value

************************************
***SPEARMAN'S RHO - Years of schooling by age
************************************
tab educ, m
tab age, m
tab educ age // too many values

***Total number of cases
count if educ!=. & age!=.

***Spearman's rho (rank correlation coefficient)
spearman educ age // unweighted

***Spearman's rho squared
*spearman stores the coefficient in r(rho); using it avoids retyping a
*rounded value that goes stale when the data change.
*Both lines give the same result (two ways of squaring).
di r(rho) * r(rho)
di r(rho)^2

************************************
***ASSOCIATIONS BETWEEN INTERVAL-RATIO-LEVEL VARIABLES
************************************

************************************
***SCATTERPLOT - Income by age
************************************
***Scatterplot without regression line
twoway scatter income age

***Scatterplot with regression line
*With the "||" syntax each plot takes its own "if" condition, so the
*condition is repeated for scatter and lfit. Otherwise the points would
*include zero incomes while the fitted line excludes them.
twoway scatter income age if income!=0 || lfit income age if income!=0, ///
	   ytitle(Wage and salary income) xtitle(Age)

*Same graph with the "()" syntax: an "if" placed after the last plot
*applies to all plots
twoway (scatter income age) (lfit income age) if income!=0, ///
	   ytitle(Wage and salary income) xtitle(Age)
	   
***Save graph (exports the graph drawn last)
graph export "$output\age-income_scatter.png", replace

***Regression coefficients
***Least-squares regression model
***They can be reported in the footnote of the scatterplot
***Income = F(Age)
svy, subpop(if income!=. & income!=0): reg income age

************************************
***LINE GRAPH - Mean income by age
************************************
***Generate variable with survey-weighted mean income by age
*egen's mean() does not accept weights, and putting [fweight=perwt] on
*"twoway line" does not change the plotted values. The weighted mean is
*therefore built by hand: sum(perwt*income) / sum(perwt), by age,
*among people with nonzero, nonmissing income.
gen double tmp_wy = income * perwt if income!=0
gen double tmp_w  = perwt if income!=0 & !missing(income)
bysort age: egen double tmp_wysum = total(tmp_wy) if income!=0
bysort age: egen double tmp_wsum  = total(tmp_w)  if income!=0
gen mincage = tmp_wysum / tmp_wsum // missing when no valid cases at that age
drop tmp_wy tmp_w tmp_wysum tmp_wsum
sum mincage, d

***Line graph of income by age
*"sort" draws the line in ascending order of age regardless of data order
twoway line mincage age, sort ///
  ytitle("Mean wage and salary income") ylabel(0(20000)80000)

***Save graph
graph export "$output\age-income_line.png", replace

***Regression coefficients
***Least-squares regression model
***They can be reported in the footnote of the line graph

***Generate age squared
gen agesq=age * age

***Income = F(Age, Age squared)
svy, subpop(if income!=. & income!=0): reg income age agesq

************************************
***TABLE - Mean income by age group
************************************
***Weighted mean and standard deviation of income by age group
***("aweight" keeps n as the sample size in each group)
tabstat income if income!=0 [aweight=perwt], by(agegr) statistics(mean sd n)

***Regression coefficients
***Income = F(Age groups)
***"ib45." sets age group 45-54 as the reference category
svy, subpop(if income!=0 & income!=.): regress income ib45.agegr

************************************
***PEARSON'S r
************************************
***It would be incorrect to use fweight,
***because you would get statistical significance
***by indicating to the test that you have
***more observations than what was actually collected

***"aweight" preserves sample size and
***it is allowed in Stata to estimate Pearson's r

***Wage and salary income, age
corr income age if income!=0 [aweight=perwt]
pwcorr income age if income!=0 [aweight=perwt] // same as above
pwcorr income age if income!=0 [aweight=perwt], sig // with significance test

***Coefficient of determination (r-squared)
*pwcorr stores the coefficient for the first two variables in r(rho);
*using it avoids retyping a rounded value that goes stale when the data change
di r(rho)^2

************************************
***Correlation matrix
************************************
***Note: educational attainment variable is ordinal, not interval-ratio

***Total number of cases
count if income!=0 & income!=. & age!=. & educ!=.

***Wage and salary income, age, education
pwcorr income age educ if income!=0 [aweight=perwt], sig

*pwcorr stores the correlation matrix in r(C), with rows/columns in the
*order of the variable list (1 = income, 2 = age, 3 = educ).
*Copy it right away, before another command overwrites r().
matrix corrmat = r(C)

***Coefficient of determination (r-squared)
***Income and age
di corrmat[2,1]^2

***Coefficient of determination (r-squared)
***Income and education
di corrmat[3,1]^2

************************************
***CLOSING COMMANDS
************************************
***Write the data in memory (including the new variables) to disk,
***overwriting an existing file of the same name
save "$data\Stata07.dta", replace

***Stop logging and close the log file
log close