`merge()` function.
# clear workspace
rm(list = ls())
# communities data from the lab (local, machine-specific path)
com <- read.csv(
  file = "C:/Users/phili/Dropbox/Essex 2017 ML/Day 2/Lab/communities.csv"
)
# immigration data, read directly from the course website
immi <- read.csv(
  file = "http://uclspp.github.io/PUBLG100/data/communities_immig.csv"
)
# employment data, read directly from the course website
empl <- read.csv(
  file = "http://uclspp.github.io/PUBLG100/data/communities_employment.csv"
)
The 2 variables that identify an observation are `state` and `communityname`. We need these variables in our separate data sets to merge. We drop other overlapping variables, but that is not necessary. If you don't, R will append the suffixes `.x` and `.y` to the overlapping variable names. You can drop by looking up which variables overlap and deleting them from one dataset by hand. Below is a way to automate this.
# the two columns that jointly identify an observation; used as merge keys
id_vars <- c("state", "communityname")
# non-key column names of the employment data (empl)
empl_vars <- setdiff(names(empl), id_vars)
# non-key column names of the communities data (com)
com_vars <- setdiff(names(com), id_vars)
# column names that occur in both empl and com
shared_vars <- intersect(empl_vars, com_vars)
# remove the shared columns from empl so that merge() does not have to
# disambiguate them with .x / .y suffixes
empl[shared_vars] <- NULL
# 1) join the employment data onto com
com <- merge(com, empl, by = id_vars)
# 2) join the immigration data onto the result
com <- merge(com, immi, by = id_vars)
# the source data sets and the helper vectors are no longer needed
rm(immi, empl, id_vars, empl_vars, com_vars, shared_vars)
Rename the `PctImmigRec5` variable to `RecentImmigration5`. We use `select()` from `dplyr` to rename the variables and make the data set a little bit smaller and easier to deal with.
# keep only the five listed variables and give each a shorter name
# (new_name = old_name); columns not listed here are not carried over
com <- dplyr::select(
  com,
  community          = communityname,
  unemploymentrate   = PctUnemployed,
  nohighschool       = PctNotHSGrad,
  white              = racePctWhite,
  recentimmigration5 = PctImmigRec5
)
RecentImmigration5.
# bivariate linear model: unemployment rate regressed on recent immigration.
# The object must be named `m_immi` (no leading dot): summary(), abline() and
# the texreg tables further down all refer to it by that name.
m_immi <- lm(unemploymentrate ~ recentimmigration5, data = com)
# print coefficients, standard errors and fit statistics
summary(m_immi)
##
## Call:
## lm(formula = unemploymentrate ~ recentimmigration5, data = com)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3772 -0.1461 -0.0397 0.1128 0.6746
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.325347 0.008917 36.488 < 2e-16 ***
## recentimmigration5 0.105882 0.021344 4.961 7.62e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.201 on 1992 degrees of freedom
## Multiple R-squared: 0.0122, Adjusted R-squared: 0.01171
## F-statistic: 24.61 on 1 and 1992 DF, p-value: 7.621e-07
We draw a scatterplot using the `plot()` function and `abline()` to draw the regression line.
# scatterplot of unemployment rate (y) against recent immigration (x);
# solid points (pch = 16), no box around the plotting region (bty = "n")
plot(
  unemploymentrate ~ recentimmigration5,
  data = com,
  xlab = "Recent Immigration",
  ylab = "Unemployment Rate",
  bty = "n",
  pch = 16
)
# overlay the fitted line from m_immi in red, twice the default width
abline(m_immi, col = "red", lwd = 2)
# minority share, computed as the complement of the white share
com[["minority"]] <- 1 - com[["white"]]
# alternative bivariate models of the unemployment rate:
# share without a high-school degree ...
m_edu <- lm(unemploymentrate ~ nohighschool, data = com)
# ... and minority share
m_minority <- lm(unemploymentrate ~ minority, data = com)
# print the three models side by side in the console
texreg::screenreg(
  list(m_edu, m_minority, m_immi)
)
##
## =========================================================
## Model 1 Model 2 Model 3
## ---------------------------------------------------------
## (Intercept) 0.08 *** 0.26 *** 0.33 ***
## (0.01) (0.01) (0.01)
## nohighschool 0.74 ***
## (0.01)
## minority 0.43 ***
## (0.02)
## recentimmigration5 0.11 ***
## (0.02)
## ---------------------------------------------------------
## R^2 0.55 0.27 0.01
## Adj. R^2 0.55 0.27 0.01
## Num. obs. 1994 1994 1994
## RMSE 0.14 0.17 0.20
## =========================================================
## *** p < 0.001, ** p < 0.01, * p < 0.05
# write the same three-model comparison table to a file on disk
# (HTML output saved under a .doc name — presumably so it opens in Word)
texreg::htmlreg(
  list(m_edu, m_minority, m_immi),
  file = "Lab2_model_comparison.doc"
)
## The table was written to the file 'Lab2_model_comparison.doc'.
Zelig.
# attach Zelig; the call must be `library(Zelig)` — with a leading dot R would
# look for a function named `.library`, which does not exist
library(Zelig)
## Warning: package 'Zelig' was built under R version 3.4.1
# re-estimate the immigration model with Zelig's least squares ("ls") model so
# that setx() / sim() can be used on it below
z.out <- zelig(unemploymentrate ~ recentimmigration5, data = com, model = "ls")
## How to cite this model in Zelig:
## R Core Team. 2007.
## ls: Least Squares Regression for Continuous Dependent Variables
## in Christine Choirat, Christopher Gandrud, James Honaker, Kosuke Imai, Gary King, and Olivia Lau,
## "Zelig: Everyone's Statistical Software," http://zeligproject.org/
# covariate scenarios: recent immigration from 0 to 1 in steps of 0.1
x.out <- setx(z.out, recentimmigration5 = seq(0, 1, 0.1))
# run 1000 simulations for the scenarios; the argument is spelled out as `num`
# instead of relying on partial matching of `n`
s.out <- sim(z.out, x = x.out, num = 1000)
# open a PNG device, draw the plot with a 95% confidence band, then close the
# device so the file is written
png("task8_plot.png")
ci.plot(s.out, xlab = "Recent Immigration (last 5 years)", ci = 95)
dev.off()
## png
## 2