************************************
************************************
***SOCI 600: INTRODUCTION TO SOCIOLOGICAL DATA ANALYSIS
***SAMPLE WEIGHTS AND BASIC DESCRIPTIVE STATISTICS
************************************
************************************

************************************
***CLEAR MEMORY
************************************
clear all

************************************
***CREATE SHORTCUTS AND LOG FILE
************************************
***Shortcut for folders
global codes  = "H:\course\codes"
global data   = "H:\course\data"
global output = "H:\course\output"

***Start saving results window
log using "$codes\Stata02.log", replace text

************************************
***OPENING COMMANDS
************************************
***Tell Stata to not pause for "more" messages
set more off

***Open 2019 ACS (only Texas)
use "$data\ACS2019.dta", clear

************************************
***GENERATE VARIABLES
************************************
***Sex
gen female=.
  replace female=0 if sex==1 // Male
  replace female=1 if sex==2 // Female

label define female 0 "Male" 1 "Female"
label values female female

***Race/ethnicity
gen raceth=.
  replace raceth=1 if race==1 & hispan==0 // White
  replace raceth=2 if race==2 & hispan==0 // Black
  replace raceth=3 if hispan>=1 & hispan<=4 // Hispanic
  replace raceth=4 if (race==4 | race==5 | race==6) & hispan==0 // Asian
  replace raceth=5 if race==3 & hispan==0 // Native American
  replace raceth=6 if (race==7 | race==8 | race==9) & hispan==0 // Other

label define raceth 1 "White" 2 "African American" 3 "Hispanic" ///
                    4 "Asian" 5 "Native American" 6 "Ohter races"
label values raceth raceth

***Age
egen agegr = cut(age), at(0,16,20,25,35,45,55,65,100)

label define agecode 0 "0-15" 16 "16-19" 20 "20-24" 25 "25-34" ///
                     35 "35-44" 45 "45-54" 55 "55-64" 65 "65-100"
label values agegr agegr

***Educational attainment
gen educgr=.
  replace educgr=1 if educ>=0 & educ<=5 // Less than high school
  replace educgr=2 if educ==6 // High school
  replace educgr=3 if educ==7 | educ==8 // Some college
  replace educgr=4 if educ==10 // College
  replace educgr=5 if educ==11 // 5+ years of college, graduate school

label define educgr 1 "Less than high school" 2 "High school" ///
                    3 "Some college" 4 "College" 5 "Graduate school"
label values educgr educgr

***Marital status
gen marital=.
  replace marital=1 if marst==1 | marst==2 // Married
  replace marital=2 if marst>=3 & marst<=5 // Separated, divorced, widowed
  replace marital=3 if marst==6 // Never married, single

label define marital 1 "Married" 2 "Separated, divorced, widowed" 3 "Never married"
label values marital marital

***Migration status
gen migrant=.
  replace migrant=1 if migrate1d==10 | migrate1d==23 // same house or within PUMA
  replace migrant=2 if migrate1d>=24 & migrate1d<=32 // internal migrant
  replace migrant=3 if migrate1d==40 // international migrant

label define migrant 1 "Non-migrant" 2 "Internal migrant" 3 "International migrant"
label values migrant migrant

***Wage and salary income
gen income=.
  replace income=incwage if incwage!=999999

************************************
***WEIGHT VARIABLES
************************************
***Person weight
sum perwt, d // summary statistics
hist perwt, ylabel(0(10)40) percent // histogram

***Household weight
sum hhwt if pernum==1, d // summary statistics
hist hhwt if pernum==1, ylabel(0(10)40) percent // histogram

************************************
***USING PERSON WEIGHT - SEX
************************************
***No weight
tab sex

***Fweight expands to population size
tab sex [fweight=perwt], m

***Iweight expands to population size
tab sex [iweight=perwt], m

***Aweight maintains sample size
tab sex [aweight=perwt], m

************************************
***USING HOUSEHOLD WEIGHT - HOME OWNERSHIP
************************************
***No weight
tab ownershp if pernum==1

***Fweight expands to population size
tab ownershp if pernum==1 [fweight=perwt], m

***Iweight expands to population size
tab ownershp if pernum==1 [iweight=perwt], m

***Aweight maintains sample size
tab ownershp if pernum==1 [aweight=perwt], m

************************************
***COMPLEX SAMPLE DESIGN
************************************
***singleunit(scaled)
***The scaling factor comes from using
***the average of the variances from the strata with multiple sampling units
***for each stratum with one sampling unit

svyset cluster [pweight=perwt], strata(strata) singleunit(scaled)

************************************
***INCOME: mean, standard error, and confidence interval
************************************
***No weight
mean income

***Fweight expands to population size
***Standard error is artificially low
***Due to high number of observations
mean income [fweight=perwt]

***Iweight expands to population size
***Standard error is artificially low
***Due to high number of observations
mean income [iweight=perwt]

***Aweight maintains sample size
***Still underestimates standard error
mean income [aweight=perwt]

***Pweight maintains sample size
***Still underestimates standard error
mean income [pweight=perwt]

***Complex sample design
svy: mean income 

***Estimate standard deviation from previous command
estat sd

************************************
***COMPLEX SAMPLE DESIGN FOR SUBPOPULATIONS
************************************
***If we consider that missing cases are part of the population,
***we need to inform that subpopulation is only non-missing cases.
***Then, full sample design is taken into account.
svy, subpop(if income!=.): mean income
estat sd

***Survey design for people with 15-64 years of age
svy, subpop(if age>=15 & age<=64): mean income

***If we consider that missing cases are part of the population,
***we need to inform that subpopulation is only non-missing cases.
***Then, full sample design is taken into account.
svy, subpop(if age>=15 & age<=64 & income!=.): mean income

************************************
***SEX
************************************
***It doesn't need complex sample design,
***because we are not calculating standard errors
tab sex [fweight=perwt]

***Number of valid cases
count if sex!=.

***Number of missing cases
count if sex==.

************************************
***RACE/ETHNICITY
************************************
***It doesn't need complex sample design,
***because we are not calculating standard errors
tab raceth [fweight=perwt]

***Number of valid cases
count if raceth!=.

***Number of missing cases
count if raceth==.

************************************
***AGE
************************************
***It doesn't need complex sample design,
***because we are not calculating standard errors
tab agegr [fweight=perwt]

***Number of valid cases
count if agegr!=.

***Number of missing cases
count if agegr==.

************************************
***EDUCATIONAL ATTAINMENT
************************************
***It doesn't need complex sample design,
***because we are not calculating standard errors
tab educgr [fweight=perwt]

***Number of valid cases
count if educgr!=.

***Number of missing cases
count if educgr==.

************************************
***MARITAL STATUS
************************************
***It doesn't need complex sample design,
***because we are not calculating standard errors
tab marital [fweight=perwt]

***Number of valid cases
count if marital!=.

***Number of missing cases
count if marital==.

************************************
***MIGRANT STATUS
************************************
***It doesn't need complex sample design,
***because we are not calculating standard errors
tab migrant [fweight=perwt]

***Number of valid cases
count if migrant!=.

***Number of missing cases
count if migrant==.

************************************
***WAGE AND SALARY INCOME
***Focus on those with some income
***(exclude those with zero income)
************************************
***Complex survey design
***Total
svy, subpop(if income!=. & income!=0): mean income
estat sd

***Men
svy, subpop(if income!=. & income!=0 & female==0): mean income
estat sd

***Women
svy, subpop(if income!=. & income!=0 & female==1): mean income
estat sd

***Population size with valid information on income
tab sex [fweight=perwt] if income!=. & income!=0

***Sample size with valid information on income
count if income!=. & income!=0

***Number of missing cases or zero income
count if income==. | income==0

***Histogram of income
hist income [fweight=perwt] if income!=0, freq normal
hist income [fweight=perwt] if income!=0, percent normal

************************************
***PIE GRAPH - RACE/ETHNICITY
************************************
***No label
graph pie [fweight=perwt], over(raceth)

***Explode all slices
graph pie [fweight=perwt], over(raceth) pie(_all, explode)

***Include label
graph pie [fweight=perwt], over(raceth) plabel(_all percent)

***Include label and explode all slices
graph pie [fweight=perwt], over(raceth) plabel(_all percent) pie(_all, explode)
						   
***Save graph
graph export "$output\race-ethnicity_pie.png", replace

************************************
***COLUMN GRAPH - RACE/ETHNICITY
************************************
graph bar [fweight=perwt], over(raceth, label(angle(45))) ytitle("Percent")

***Save graph
graph export "$output\race-ethnicity_column.png", replace

************************************
***HISTOGRAMS - AGE
************************************
***Histogram of age
hist age [fweight=perwt], percent discrete xlabel(0(10)100) xtitle("Age")

***Save graph
graph export "$output\age_histogram.png", replace

***Histogram of age by sex
hist age [fweight=perwt], percent discrete by(female) xlabel(0(10)100) xtitle("Age")

***Overlaying histograms of age by sex
twoway (histogram age if female==0 [fweight=perwt], frequency discrete xlabel(0(10)100) fcolor(gs11) lcolor(gs11))  ///
       (histogram age if female==1 [fweight=perwt], frequency discrete xlabel(0(10)100) fcolor(none) lcolor(black) ///
       legend(order(1 "Males" 2 "Females")) ///
       xtitle("Age"))

***Save graph
graph export "$output\age-sex_histogram.png", replace

************************************
***BOXPLOT - INCOME
************************************
***Vertical boxplot of income
graph box income if income!=0 [fweight=perwt], ytitle(Wage and salary income)

***Horizontal boxplot of income
graph hbox income if income!=0 [fweight=perwt], ytitle(Wage and salary income)

***Save graph
graph export "$output\income_boxplot.png", replace

************************************
***SCATTER PLOT - INCOME VERSUS AGE
***Weights don't work well with scatter plots (change size of dots)
************************************
***Scatter plot of age by income
scatter income age
twoway (scatter income age) (lfit income age)

***Save graph
graph export "$output\age-income_scatter.png", replace

************************************
***BAR GRAPH - AGE-SEX STRUCTURE
************************************
***Generate five-year age groups variable - automatically
egen age5y = cut(age), at(0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,100)
table age5y, stat(min age) stat(max age) stat(count age)

***Generate male variable (opposite of female variable)
gen male=!female
tab male female, m nolabel

***Generate variables with male and female totals by five-year age groups
sort age5y
by age5y: egen maletotal=total(male)
by age5y: egen femaletotal=total(female)

***Replace male total by negative value
replace maletotal=-maletotal

***Age-sex structure
twoway bar maletotal age5y [fweight=perwt], horizontal barwidth(5) fcolor(navy) lcolor(black) lwidth(medium) || ///
       bar femaletotal age5y [fweight=perwt], horizontal barwidth(5) fcolor(maroon) lcolor(black) lwidth(medium) ///
       legend(label(1 Males) label(2 Females)) ///
       ytitle("Age group") ///
       ylabel(0(5)85, angle(horizontal) valuelabel labsize(*.8)) ///
       xtitle("Population size") ///
       xlabel(-10000 "10000" -5000 "5000" 0 5000 10000) ///
       title("Age-sex structure, Texas") ///
       subtitle("2019 American Community Survey")

***Save graph
graph export "$output\age-sex_bar.png", replace

************************************
***CLOSING COMMANDS
************************************
***Save data
save "$data\Stata02.dta", replace

***Save log
log close