r代写-HW4
时间:2022-03-07
HW4
Yuxian Chen
2/23/2022
# Load package
#install.packages("tidyverse")
#install.packages("moderndive") # note: I needed to *not* restart R
#install.packages("skimr")
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(moderndive)
library(skimr)
library(ggplot2)
library(dplyr)
library(tinytex)
library(gridExtra)
##
## Attaching package: ’gridExtra’
## The following object is masked from ’package:dplyr’:
##
## combine
load(file="MA_COVID19_22_02_15.RData")
load(file="MA_county_variables_2020.RData")
1
There are 14 observational units in ma_extra and 10594 observational units in mass.
1
2ma_extra$true_pct_NHWA_2019 <- ma_extra$pct_NHWA_2019 * 100
ma_extra$pct_nonwhite <- 100 - ma_extra$true_pct_NHWA_2019
mass_filtered <- mass %>% filter(date == max(date)) %>% filter(county != "Unknown")
mass_cov <- ma_extra %>% left_join(mass_filtered, by="county")
a
mass_cov$deaths_per_100000 <- 100000 * (mass_cov$deaths/mass_cov$X2019_pop_est)
pct_nonwhite <- mass_cov %>% get_correlation(formula = deaths_per_100000 ~ pct_nonwhite)
people_per_Housing <- mass_cov %>% get_correlation(formula = deaths_per_100000 ~ people_per_Housing)
median_age <- mass_cov %>% get_correlation(formula = deaths_per_100000 ~ X2019_median_age)
median_income <- mass_cov %>% get_correlation(formula = deaths_per_100000 ~ Median_Household_Income_2018)
percent_poverty <- mass_cov %>% get_correlation(formula = deaths_per_100000 ~ PCTPOVALL_2018)
pct_nonwhite
## cor
## 1 0.1710611
people_per_Housing
## cor
## 1 0.7920982
median_age
## cor
## 1 -0.1846349
median_income
## cor
## 1 -0.2686711
percent_poverty
## cor
## 1 0.3826068
People_per_housing has the strongest correlation of value 0.792, which makes sense since peo-
ple_per_housing is a combined indicator of factors including poverty and household income.
b
2
glimpse(mass_cov)
## Rows: 14
## Columns: 36
## $ county Barnstable, Berkshire, Bris~
## $ county_FIPS 25001, 25003, 25005, 25007,~
## $ X2019_unemployment 3.9, 3.5, 3.7, 4.1, 3.0, 2.~
## $ Median_Household_Income_2018 69001, 58375, 66005, 70224,~
## $ X2019_pop_est 212990, 124944, 565217, 173~
## $ X2019_age_85_plus 8461, 4228, 13659, 415, 191~
## $ X2019_median_age 54.3, 47.6, 41.1, 48.5, 40.~
## $ X2019_housing_units 164674, 69212, 236915, 1814~
## $ TOT_MALE_2019 101791, 60406, 273426, 8545~
## $ TOT_FEMALE_2019 111199, 64538, 291791, 8787~
## $ NHWA_MALE_2019 90410, 52759, 222482, 7403,~
## $ NHWA_FEMALE_2019 100238, 56764, 239860, 7660~
## $ Tot_NHWA_2019 190648, 109523, 462342, 150~
## $ pct_NHWA_2019 0.8951031, 0.8765767, 0.817~
## $ people_per_Housing 1.2934039, 1.8052361, 2.385~
## $ june_19_unemployment 3.1, 3.5, 3.9, 2.8, 3.2, 2.~
## $ june_20_unemployment 18.5, 16.2, 19.3, 13.7, 19.~
## $ Rural_urban_Continuum_Code_2013 3, 3, 1, 7, 1, 4, 2, 2, 1, ~
## $ Urban_Influence_Code_2013 2, 2, 1, 8, 1, 5, 2, 2, 1, ~
## $ Percent_with_a_bachelor_degree_2014_18 42.8, 33.4, 28.0, 44.2, 39.~
## $ Percent_with_less_than_high_school_2014_18 4.5, 8.9, 15.1, 4.5, 10.7, ~
## $ POVALL_2018 16661, 13673, 59147, 1316, ~
## $ PCTPOVALL_2018 8.0, 11.3, 10.8, 7.7, 10.7,~
## $ Urban_rural urban, urban, urban, rural,~
## $ true_pct_NHWA_2019 89.51031, 87.65767, 81.7990~
## $ pct_nonwhite 10.489694, 12.342329, 18.20~
## $ date 2022-02-14, 2022-02-14, 20~
## $ state Massachusetts, Massachusett~
## $ fips 25001, 25003, 25005, 25007,~
## $ cases 33898, 21935, 146917, 3361,~
## $ deaths 647, 389, 2390, 0, 3051, 15~
## $ new_cases 60, 115, 287, 1, 309, 55, 3~
## $ new_deaths 2, 1, 7, 0, 4, 2, 8, 1, 11,~
## $ new_cases_07_day NA, NA, NA, NA, NA, NA, NA,~
## $ new_deaths_07_day NA, NA, NA, NA, NA, NA, NA,~
## $ deaths_per_100000 303.7701, 311.3395, 422.846~
mass_cov %>% select(deaths_per_100000, pct_nonwhite, people_per_Housing,
Median_Household_Income_2018, X2019_median_age,
PCTPOVALL_2018) %>% skim_without_charts()
Table 1: Data summary
Name Piped data
Number of rows 14
Number of columns 6
_______________________
3
Table 1: Data summary
Column type frequency:
numeric 6
________________________
Group variables None
Variable type: numeric
skim_variable n_missingcomplete_ratemean sd p0 p25 p50 p75 p100
deaths_per_100000 0 1 279.24 133.01 0.00 253.89 305.12 363.47 428.84
pct_nonwhite 0 1 23.71 12.46 9.62 13.99 21.58 28.88 54.82
people_per_Housing 0 1 2.08 0.60 0.90 1.87 2.40 2.50 2.51
Median_Household_Income_20180 1 75312.79 15742.53 52682.00 66689.50 70995.50 86351.75 100374.00
X2019_median_age 0 1 42.26 5.44 33.30 39.38 40.85 46.13 54.30
PCTPOVALL_2018 0 1 9.95 3.58 5.90 7.40 9.80 11.02 17.50
regression <- lm(deaths_per_100000 ~ people_per_Housing, data = mass_cov)
plot(mass_cov$people_per_Housing, mass_cov$deaths_per_100000)
abline(regression)
1.0 1.5 2.0 2.5
0
10
0
20
0
30
0
40
0
mass_cov$people_per_Housing
m
a
ss
_
co
v$
de
ath
s_
pe
r_1
00
00
0
### c
4
summary(regression)
##
## Call:
## lm(formula = deaths_per_100000 ~ people_per_Housing, data = mass_cov)
##
## Residuals:
## Min 1Q Median 3Q Max
## -107.25 -64.55 -21.25 67.94 162.76
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -87.11 84.57 -1.030 0.323300
## people_per_Housing 176.37 39.24 4.495 0.000733 ***
## ---
## Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ’*’ 0.05 ’.’ 0.1 ’ ’ 1
##
## Residual standard error: 84.5 on 12 degrees of freedom
## Multiple R-squared: 0.6274, Adjusted R-squared: 0.5964
## F-statistic: 20.21 on 1 and 12 DF, p-value: 0.0007326
The regression line is y = 176.37x - 87.11
d
These two variables are positively related since the slope is positive. The negative intercept shifts the entire
regression line downwards.
e
regression
##
## Call:
## lm(formula = deaths_per_100000 ~ people_per_Housing, data = mass_cov)
##
## Coefficients:
## (Intercept) people_per_Housing
## -87.11 176.37
res = get_regression_points(regression, ID="county")
res %>% arrange(desc(residual))
## # A tibble: 14 x 5
## county deaths_per_100000 people_per_Housing deaths_per_100000_h~ residual
##
## 1 Barnstable 304. 1.29 141. 163.
## 2 Hampden 429. 2.40 337. 91.7
## 3 Bristol 423. 2.39 334. 89.2
5
## 4 Berkshire 311. 1.80 231. 80.1
## 5 Essex 387. 2.51 355. 31.6
## 6 Plymouth 368. 2.49 352. 16.2
## 7 Worcester 350. 2.46 346. 3.99
## 8 Suffolk 277. 2.33 323. -46.5
## 9 Norfolk 306. 2.51 355. -48.8
## 10 Franklin 224. 2.05 275. -51.0
## 11 Middlesex 284. 2.50 353. -69.0
## 12 Nantucket 0 0.899 71.5 -71.5
## 13 Dukes 0 0.955 81.4 -81.4
## 14 Hampshire 246. 2.50 353. -107.
Barnstable has especially large residuals and Hampshire has small residuals. Large residual values indicate
that those counties has greater number of deaths per 100000 than predicted and vice versa for counties with
especially low residual values. This may be due to factors such as availability of testing, reporting efficiency,
etc.
3
a
ma_extra
<- ma_extra %>% mutate(region=case_when(county %in%
c("Hampshire", "Franklin", "Hampden", "Berkshire", "Worcester") ~
"West",
county %in% c("Barnstable", "Dukes", "Nantucket", "Plymouth", "Bristol") ~ "Cape",
county %in% c("Essex", "Middlesex", "Suffolk", "Norfolk") ~ "East"))
ma_extra %>% select(county, region)
## county region
## 1 Barnstable Cape
## 2 Berkshire West
## 3 Bristol Cape
## 4 Dukes Cape
## 5 Essex East
## 6 Franklin West
## 7 Hampden West
## 8 Hampshire West
## 9 Middlesex East
## 10 Nantucket Cape
## 11 Norfolk East
## 12 Plymouth Cape
## 13 Suffolk East
## 14 Worcester West
The levels of region variable are East, West and Cape.
b
mass_cov <- ma_extra %>% left_join(mass_filtered, by="county")
mass_cov$cases_per_100000 <- 100000 * (mass_cov$cases/mass_cov$X2019_pop_est)
glimpse(mass_cov)
6
## Rows: 14
## Columns: 37
## $ county Barnstable, Berkshire, Bris~
## $ county_FIPS 25001, 25003, 25005, 25007,~
## $ X2019_unemployment 3.9, 3.5, 3.7, 4.1, 3.0, 2.~
## $ Median_Household_Income_2018 69001, 58375, 66005, 70224,~
## $ X2019_pop_est 212990, 124944, 565217, 173~
## $ X2019_age_85_plus 8461, 4228, 13659, 415, 191~
## $ X2019_median_age 54.3, 47.6, 41.1, 48.5, 40.~
## $ X2019_housing_units 164674, 69212, 236915, 1814~
## $ TOT_MALE_2019 101791, 60406, 273426, 8545~
## $ TOT_FEMALE_2019 111199, 64538, 291791, 8787~
## $ NHWA_MALE_2019 90410, 52759, 222482, 7403,~
## $ NHWA_FEMALE_2019 100238, 56764, 239860, 7660~
## $ Tot_NHWA_2019 190648, 109523, 462342, 150~
## $ pct_NHWA_2019 0.8951031, 0.8765767, 0.817~
## $ people_per_Housing 1.2934039, 1.8052361, 2.385~
## $ june_19_unemployment 3.1, 3.5, 3.9, 2.8, 3.2, 2.~
## $ june_20_unemployment 18.5, 16.2, 19.3, 13.7, 19.~
## $ Rural_urban_Continuum_Code_2013 3, 3, 1, 7, 1, 4, 2, 2, 1, ~
## $ Urban_Influence_Code_2013 2, 2, 1, 8, 1, 5, 2, 2, 1, ~
## $ Percent_with_a_bachelor_degree_2014_18 42.8, 33.4, 28.0, 44.2, 39.~
## $ Percent_with_less_than_high_school_2014_18 4.5, 8.9, 15.1, 4.5, 10.7, ~
## $ POVALL_2018 16661, 13673, 59147, 1316, ~
## $ PCTPOVALL_2018 8.0, 11.3, 10.8, 7.7, 10.7,~
## $ Urban_rural urban, urban, urban, rural,~
## $ true_pct_NHWA_2019 89.51031, 87.65767, 81.7990~
## $ pct_nonwhite 10.489694, 12.342329, 18.20~
## $ region "Cape", "West", "Cape", "Ca~
## $ date 2022-02-14, 2022-02-14, 20~
## $ state Massachusetts, Massachusett~
## $ fips 25001, 25003, 25005, 25007,~
## $ cases 33898, 21935, 146917, 3361,~
## $ deaths 647, 389, 2390, 0, 3051, 15~
## $ new_cases 60, 115, 287, 1, 309, 55, 3~
## $ new_deaths 2, 1, 7, 0, 4, 2, 8, 1, 11,~
## $ new_cases_07_day NA, NA, NA, NA, NA, NA, NA,~
## $ new_deaths_07_day NA, NA, NA, NA, NA, NA, NA,~
## $ cases_per_100000 15915.30, 17555.87, 25993.0~
mass_cov %>% select(cases_per_100000, region) %>% skim_without_charts()
Table 3: Data summary
Name Piped data
Number of rows 14
Number of columns 2
_______________________
Column type frequency:
character 1
numeric 1
________________________
7
Table 3: Data summary
Group variables None
Variable type: character
skim_variable n_missing complete_rate min max empty n_unique whitespace
region 0 1 4 4 0 3 0
Variable type: numeric
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
cases_per_100000 0 1 21336.83 4801.86 14317.47 17670.84 20568.93 25447.26 29178
mass_cov %>% ggplot(aes(x=region, y=cases_per_100000)) + geom_boxplot() +
ggtitle("Cases per 100000 in East,West,Cape region")
15000
20000
25000
Cape East West
region
ca
se
s_
pe
r_
10
00
00
Cases per 100000 in East,West,Cape region
c
8
regression1 <- lm(cases_per_100000 ~ region, data=mass_cov)
regression1
##
## Call:
## lm(formula = cases_per_100000 ~ region, data = mass_cov)
##
## Coefficients:
## (Intercept) regionEast regionWest
## 22440.8 -381.4 -2786.0
y = -381.4 * East - 2786 * West + 22440.8 For east: y = -381.4 * East For west: y = -2786 * West For cape:
y = 22440.8
e
The baseline here is the Cape. The mean cases per 100000 for counties in the Cape region is the y-intercept
of our model which is 22440.8. -381.4 and -2786 are the differences in mean cases per 100000 of East and
Weat relative to Cape. When we have our baseline in regression, we can use it to compare with other two
regions. If we have the county in the region then our indicator will be 1, otherwise 0.
4
For solid line: (3 - 2.5)ˆ2 + (2 - 1)ˆ2 + (2 - 1.5)ˆ2 = 1.5
For dashed line: (2 - 2)ˆ2 + (1.5 - 1)ˆ2 + (3 - 1)ˆ2 = 4.25
For dotted line: (2.5 - 2)ˆ2 + (2.5 - 1)ˆ2 + (2.5 - 3)ˆ2 = 2.75
The solid line has smallest sum of squared residuals value.
9
Homework 3
Yuxian Chen
2/12/2022
library(ggplot2)
library(dplyr)
##
## Attaching package: ’dplyr’
## The following objects are masked from ’package:stats’:
##
## filter, lag
## The following objects are masked from ’package:base’:
##
## intersect, setdiff, setequal, union
library(tinytex)
library(gridExtra)
##
## Attaching package: ’gridExtra’
## The following object is masked from ’package:dplyr’:
##
## combine
# add appropriate set-up. refer to in-class exercise
counties<-read.csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv")
1
a
mass2<-counties %>% filter(state=="Massachusetts")
mass2 <- mutate(mass2, date=as.Date(date), county=droplevels(mass2)$county)
glimpse(mass2)
1
## Rows: 10,579
## Columns: 6
## $ date 2020-02-01, 2020-02-02, 2020-02-03, 2020-02-04, 2020-02-05, 20~
## $ county "Suffolk", "Suffolk", "Suffolk", "Suffolk", "Suffolk", "Suffolk~
## $ state "Massachusetts", "Massachusetts", "Massachusetts", "Massachuset~
## $ fips 25025, 25025, 25025, 25025, 25025, 25025, 25025, 25025, 25025, ~
## $ cases 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ deaths 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
rm(counties)
b
mass2<-mass2 %>% group_by(county) %>%
mutate(new_deaths=deaths-lag(deaths, default=0, order_by=date))
mass2<-mass2 %>% group_by(county) %>%
mutate(new_cases=cases-lag(cases, default=0, order_by=date))
mass2 %>% filter(deaths >= 0 & county != "Unknown") %>%
ggplot(aes(x=date, y=new_deaths)) + geom_line() + facet_wrap(~county)
Suffolk Worcester
Middlesex Nantucket Norfolk Plymouth
Essex Franklin Hampden Hampshire
Barnstable Berkshire Bristol Dukes
2020−012020−072021−012021−072022−01020−012020−072021−012021−072022−01
2020−012020−072021−012021−072022−01020−012020−072021−012021−072022−01
0
25
50
75
0
25
50
75
0
25
50
75
0
25
50
75
date
n
ew
_
de
at
hs
2
clibrary(zoo)
##
## Attaching package: ’zoo’
## The following objects are masked from ’package:base’:
##
## as.Date, as.Date.numeric
mass2 <- mass2 %>% mutate(new_cases_07_day = zoo::rollmean(new_cases, k = 7, fill = NA))
mass2 <- mass2 %>% mutate(new_deaths_07_day = zoo::rollmean(new_deaths, k = 7, fill = NA))
mass2 %>% filter(deaths >= 0 & county != "Unknown") %>%
ggplot(aes(x=date, y=new_deaths_07_day)) + geom_line() + facet_wrap(~county)
## Warning: Removed 6 row(s) containing missing values (geom_path).
Suffolk Worcester
Middlesex Nantucket Norfolk Plymouth
Essex Franklin Hampden Hampshire
Barnstable Berkshire Bristol Dukes
2020−012020−072021−012021−072022−01020−012020−072021−012021−072022−01
2020−012020−072021−012021−072022−01020−012020−072021−012021−072022−01
0
10
20
30
40
0
10
20
30
40
0
10
20
30
40
0
10
20
30
40
date
n
ew
_
de
at
hs
_0
7_
da
y
d
3
mass_totals2 <- aggregate(cbind(cases, deaths, new_cases, new_deaths,
new_cases_07_day, new_deaths_07_day) ~
date, data=mass2, FUN = sum, na.rm=FALSE)
glimpse(mass_totals2)
## Rows: 738
## Columns: 7
## $ date 2020-02-04, 2020-02-05, 2020-02-06, 2020-02-07, 202~
## $ cases 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ deaths 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ new_cases 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ new_deaths 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ new_cases_07_day 0.1428571, 0.0000000, 0.0000000, 0.0000000, 0.000000~
## $ new_deaths_07_day 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
2
a
load(file="MA_COVID19_22_02_08.RData")
load(file="MA_county_variables_2020.RData")
ma_extra$pct_over_85 = ma_extra$X2019_age_85_plus/sum(ma_extra$X2019_age_85_plus)
ma_extra %>% ggplot(aes(x=pct_over_85, fill=county)) + geom_histogram(bins=10) +
ggtitle("pct_over_85")
4
01
2
3
4
0.00 0.05 0.10 0.15 0.20
pct_over_85
co
u
n
t
county
Barnstable
Berkshire
Bristol
Dukes
Essex
Franklin
Hampden
Hampshire
Middlesex
Nantucket
Norfolk
Plymouth
Suffolk
Worcester
pct_over_85
b
ma_extra$true_pct_NHWA_2019 = ma_extra$pct_NHWA_2019 * 100
ma_extra %>% ggplot(aes(x=true_pct_NHWA_2019, fill=county)) + geom_histogram(bins=10) +
ggtitle("true_pct_NHWA_2019")
5
01
2
3
50 60 70 80 90
true_pct_NHWA_2019
co
u
n
t
county
Barnstable
Berkshire
Bristol
Dukes
Essex
Franklin
Hampden
Hampshire
Middlesex
Nantucket
Norfolk
Plymouth
Suffolk
Worcester
true_pct_NHWA_2019
c
ma_extra$pct_nonwhite = 100 - ma_extra$pct_NHWA_2019
ma_extra %>% ggplot(aes(x=pct_nonwhite, fill=county)) + geom_histogram(bins=10) +
ggtitle("pct_nonwhite")
6
01
2
3
99.1 99.2 99.3 99.4 99.5
pct_nonwhite
co
u
n
t
county
Barnstable
Berkshire
Bristol
Dukes
Essex
Franklin
Hampden
Hampshire
Middlesex
Nantucket
Norfolk
Plymouth
Suffolk
Worcester
pct_nonwhite
d
mass_cov <- ma_extra %>% left_join(mass, by="county")
mass_cov$new_cases_per_100000 = mass_cov$new_cases / 100000
mass_cov$new_deaths_per_100000 = mass_cov$new_deaths / 100000
mass_cov$cases_per_100000 = mass_cov$cases / 100000
mass_cov$deaths_per_100000 = mass_cov$deaths / 100000
mass_cov$new_cases_per_100000_07_day = mass_cov$new_cases_07_day / 100000
mass_cov$new_deaths_per_100000_07_day = mass_cov$new_deaths_07_day / 100000
line1 <- mass_cov %>% filter(county != "Unknown" & new_cases_per_100000 >= 0) %>%
ggplot(aes(x=date, y=new_cases_per_100000, group=county, color=county)) +
geom_line() + ggtitle("new_cases_per_100000")
line2 <- mass_cov %>% filter(county != "Unknown" & new_deaths_per_100000 >= 0) %>%
ggplot(aes(x=date, y=new_deaths_per_100000, group=county, color=county)) +
geom_line() + ggtitle("new_deaths_per_100000")
line3 <- mass_cov %>% filter(county != "Unknown" & cases_per_100000 >= 0) %>%
ggplot(aes(x=date, y=cases_per_100000, group=county, color=county)) +
geom_line() + ggtitle("cases_per_100000")
line4 <- mass_cov %>% filter(county != "Unknown" & deaths_per_100000 >= 0) %>%
ggplot(aes(x=date, y=deaths_per_100000, group=county, color=county)) +
geom_line() + ggtitle("deaths_per_100000")
line5 <- mass_cov %>% filter(county != "Unknown" & new_cases_per_100000_07_day >= 0) %>%
ggplot(aes(x=date, y=new_cases_per_100000_07_day, group=county, color=county)) +
7
geom_line() + ggtitle("new_cases_per_100000_07_day")
line6 <- mass_cov %>% filter(county != "Unknown" & new_deaths_per_100000_07_day >= 0) %>%
ggplot(aes(x=date, y=new_deaths_per_100000_07_day, group=county, color=county)) +
geom_line() + ggtitle("new_deaths_per_100000_07_day")
grid.arrange(line1, line2, line3, line4, line5, line6, nrow=3)
0.00
0.05
0.10
2020−012020−072021−012021−072022−01
date
n
ew
_
ca
se
s_
pe
r_
10
00
00
county
Barnstable
Berkshire
Bristol
Dukes
Essex
Franklin
Hampden
Hampshire
Middlesex
Nantucket
Norfolk
Plymouth
Suffolk
Worcester
new_cases_per_100000
0e+00
2e−04
4e−04
6e−04
2020−012 20−072 21−012 21−072 22−01
date
n
ew
_
de
at
hs
_p
er
_1
00
00
0
county
Barnstable
Berkshire
Bristol
Dukes
Essex
Franklin
Hampden
Hampshire
Middlesex
Nantucket
Norfolk
Plymouth
Suffolk
Worcester
new_deaths_per_100000
0
1
2
3
2020−012020−072021−012021−072022−01
datec
a
se
s_
pe
r_
10
00
00
county
Barnstable
Berkshire
Bristol
Dukes
Essex
Franklin
Hampden
Hampshire
Middlesex
Nantucket
Norfolk
Plymouth
Suffolk
Worcester
cases_per_100000
0.00
0.01
0.02
0.03
0.04
2020−012020−072021−012021−072022−01
datede
at
hs
_p
er
_1
00
00
0
county
Barnstable
Berkshire
Bristol
Dukes
Essex
Franklin
Hampden
Hampshire
Middlesex
Nantucket
Norfolk
Plymouth
Suffolk
Worcester
deaths_per_100000
0.00
0.01
0.02
0.03
0.04
0.05
2020−012020−072021−012021−072022−01
date
n
ew
_
ca
se
s_
pe
r_
10
00
00
_0
7_
da
y
county
Barnstable
Berkshire
Bristol
Dukes
Essex
Franklin
Hampden
Hampshire
Middlesex
Nantucket
Norfolk
Plymouth
Suffolk
Worcester
new_cases_per_100000_07_day
0e+00
1e−04
2e−04
3e−04
4e−04
2020−012 20−072 21−012 21−072 22−01
date
n
ew
_
de
at
hs
_p
er
_1
00
00
0_
07
_d
ay
county
Barnstable
Berkshire
Bristol
Dukes
Essex
Franklin
Hampden
Hampshire
Middlesex
Nantucket
Norfolk
Plymouth
Suffolk
Worcester
new_deaths_per_100000_07_day
e
mass_cov_des_cases <- mass_cov %>%
filter(county != "Unknown" & cases_per_100000 >= 0) %>%
arrange(desc(cases_per_100000))
mass_cov_des_deaths <- mass_cov %>%
filter(county != "Unknown" & deaths_per_100000 >= 0) %>%
arrange(desc(deaths_per_100000))
Middlesex county has higher rate of deaths compare to the rate of cases. Middlesex has higher population
density.
8
3a
library(lubridate)
##
## Attaching package: ’lubridate’
## The following objects are masked from ’package:base’:
##
## date, intersect, setdiff, union
mass_cov$month = month(mass_cov$date, label=TRUE)
mass_cov$year = year(mass_cov$date)
b
mass_cov %>% ggplot(aes(x=month, y=new_cases_per_100000)) + geom_boxplot() +
facet_wrap(~year)
2020 2021 2022
JanFebMarAprMayJunJulAugSepOctNovDec JanFebMarAprMayJunJulAugSepOctNovDec JanFebMarAprMayJunJulAugSepOctNovDec
0.00
0.05
0.10
month
n
ew
_
ca
se
s_
pe
r_
10
00
00
9
4library(usmap)
plot_usmap("counties", data=mass, include = c("MA"), labels=TRUE, values="new_cases")
Barnstable
Berkshire
Bristol
Dukes
Essex
Franklin
Hampden
Hampshire
Middlesex
Nantucket
Norfolk Plymouth
Suffolk
Worcester
0
5000
10000
map_df[, values]
plot_usmap("counties", data=mass_cov, include = c("MA"), labels=TRUE, values="TOT_FEMALE_2019")
10
Barnstable
Berkshire
Bristol
Dukes
Essex
Franklin
Hampden
Hampshire
Middlesex
Nantucket
Norfolk Plymouth
Suffolk
Worcester
2e+05
4e+05
6e+05
8e+05
map_df[, values]
Areas with higher population has more cases of COVID19.
##5 Inner join only preserves the common data of A and B, where left join excludes only the part doesn’t
share with A. We want to preserve the data in the original table so we used left_join.
##6 People might not know they have COVID so they did not go to testing.
##7 Every column is a variable. Every row is an observation. Every cell is a single value.
11
HW2
Yuxian Chen
2/7/2022
1
A linegraph is better at showing the trend.
2
a
library(ggplot2)
## Warning in register(): Can’t find generic ‘scale_type‘ in package ggplot2 to
## register S3 method.
library(dplyr)
##
## Attaching package: ’dplyr’
## The following objects are masked from ’package:stats’:
##
## filter, lag
## The following objects are masked from ’package:base’:
##
## intersect, setdiff, setequal, union
library(tinytex)
library(gridExtra)
##
## Attaching package: ’gridExtra’
## The following object is masked from ’package:dplyr’:
##
## combine
1
load(file="MA_COVID19_22_02_01.RData")
load(file="MA_county_variables_2020.RData")
There is 1 object called “ma_extra” in the data window.
b
Categorical: “county_FIPS” and “Urban_rural” Numerical: “X2019_unemployment” and “X2019_pop_est”
c
hist1<-ma_extra %>% ggplot(aes(x=X2019_unemployment)) +
geom_histogram(bins=10) + ggtitle("2019_unemployment") +
labs(x="Unemployment Rate")
hist2<-ma_extra %>% ggplot(aes(x=Median_Household_Income_2018)) +
geom_histogram(bins=10) + ggtitle("Median_Household_Income_2018") +
labs(x="Income")
hist3<-ma_extra %>% ggplot(aes(x=X2019_pop_est)) +
geom_histogram(bins=10) + ggtitle("X2019_pop_est") +
labs(x="Population")
hist4<-ma_extra %>% ggplot(aes(x=X2019_age_85_plus)) +
geom_histogram(bins=10) + ggtitle("X2019_age_85_plus") +
labs(x="# people")
hist5<-ma_extra %>% ggplot(aes(x=X2019_median_age)) +
geom_histogram(bins=10) + ggtitle("X2019_median_age") +
labs(x="Median age")
hist6<-ma_extra %>% ggplot(aes(x=X2019_housing_units)) +
geom_histogram(bins=10) + ggtitle("X2019_housing_units") +
labs(x="# housing units")
grid.arrange(hist1,hist2,hist3,hist4,hist5,hist6,nrow=3)
2
0.0
0.5
1.0
1.5
2.0
2.5 3.0 3.5 4.0
Unemployment Rate
co
u
n
t
2019_unemployment
0
1
2
3
5e+04 6e+04 7e+04 8e+04 9e+04 1e+05
Income
co
u
n
t
Median_Household_Income_2018
0
1
2
3
0 500000 1000000 1500000
Population
co
u
n
t
X2019_pop_est
0
1
2
3
4
0 10000 20000 30000
# people
co
u
n
t
X2019_age_85_plus
0
1
2
3
4
5
35 40 45 50 55
Median age
co
u
n
t
X2019_median_age
0
1
2
3
0e+00 2e+05 4e+05 6e+05
# housing units
co
u
n
t
X2019_housing_units
### d
scat1<-ma_extra %>% ggplot(aes(x=X2019_pop_est, y=TOT_MALE_2019)) +
geom_point() + ggtitle("Population vs # male") + labs(x="Population",
y="# males")
scat2<-ma_extra %>% ggplot(aes(x=Median_Household_Income_2018, y=X2019_unemployment)) +
geom_point() + ggtitle("Unemployment vs median household income") +
labs(x="median household income", y="unemployment rate")
scat3<-ma_extra %>% ggplot(aes(x=X2019_median_age, y=X2019_age_85_plus)) +
geom_point() + ggtitle("Median age vs age 85 plus") +
labs(x="Median age", y="Age 85 plus")
scat4<-ma_extra %>% ggplot(aes(x=X2019_housing_units, y=X2019_unemployment)) +
geom_point() + ggtitle("Housing units vs unemployment") +
labs(x="# housing units", y="unemployment rate")
grid.arrange(scat1,scat2,scat3,scat4, nrow=2)
3
0e+00
2e+05
4e+05
6e+05
8e+05
0 500000 1000000 1500000
Population
#
m
al
es
Population vs # male
2.5
3.0
3.5
4.0
6e+04 7e+04 8e+04 9e+04 1e+05
median household income
u
n
e
m
pl
oy
m
en
t r
a
te
Unemployment vs median household income
0
10000
20000
30000
35 40 45 50 55
Median age
Ag
e
85
p
lu
s
Median age vs age 85 plus
2.5
3.0
3.5
4.0
0e+00 2e+05 4e+05 6e+05
# housing units
u
n
e
m
pl
oy
m
en
t r
a
te
Housing units vs unemployment
e
ma_extra %>% ggplot(aes(x=X2019_pop_est, y=TOT_MALE_2019,
color=Median_Household_Income_2018)) +
geom_point() + ggtitle("Population vs # male vs median income") +
labs(x="Population", y="# male")
4
0e+00
2e+05
4e+05
6e+05
8e+05
0 500000 1000000 1500000
Population
#
m
al
e
6e+04
7e+04
8e+04
9e+04
1e+05
Median_Household_Income_2018
Population vs # male vs median income
### f
scat2<-ma_extra %>% ggplot(aes(x=Median_Household_Income_2018, y=X2019_unemployment)) +
geom_point() + ggtitle("Unemployment vs median household income") +
labs(x="median household income", y="unemployment rate") +
geom_label(aes(label=county))
scat2
5
Barnstable
Berkshire
Bristol
Dukes
Essex
Franklin
Hampden
Hampshire
Middlesex
Nantucket
Norfolk
Plymouth
Suffolk
Worcester
2.5
3.0
3.5
4.0
6e+04 7e+04 8e+04 9e+04 1e+05
median household income
u
n
e
m
pl
oy
m
en
t r
a
te
Unemployment vs median household income
### g 1. Nantucket has high unemployment rate but high median household income. 2. Seems like
percentage of males is increasing with population.
3
a
mass_cov<-mass %>% left_join(ma_extra, by="county")
b
The variable “county” should match in both datasets to help joining. csv stands for comma separated file,
but commas may cause trouble when reading into R.
c
mass_cov %>% filter(county!="Unknown" & new_cases>=0) %>% ggplot(aes(x=date,
y=new_cases,
group=county,
color=Urban_rural)) +
geom_line() + ggtitle("New cases by county") + labs(x="Date", y="New cases")
6
05000
10000
2020−01 2020−07 2021−01 2021−07 2022−01
Date
N
ew
c
a
se
s Urban_rural
rural
urban
New cases by county
There are much more cases in urban areas.
d
mass_cov %>% filter(date==max(date)) %>% ggplot(aes(x=X2019_pop_est, y=cases)) +
geom_point() + ggtitle("Newest cases by population") + labs(x="Population",
y="Cases")
## Warning: Removed 1 rows containing missing values (geom_point).
7
0e+00
1e+05
2e+05
3e+05
0 500000 1000000 1500000
Population
Ca
se
s
Newest cases by population
Total cases are positively correlated with population.
e
Number of cases relates stronger with population but not population density.
4
mass %>% filter(new_cases>=0) %>% ggplot(aes(x=date, y=new_cases)) +
geom_line() + facet_wrap(~county) + labs(x="date", y="new cases")
8
Suffolk Unknown Worcester
Middlesex Nantucket Norfolk Plymouth
Essex Franklin Hampden Hampshire
Barnstable Berkshire Bristol Dukes
2020−012020−072021−012021−072022−01020−012020−072021−012021−072022−01020−012020−072021−012021−072022−01
2020−012020−072021−012021−072022−01
0
5000
10000
0
5000
10000
0
5000
10000
0
5000
10000
date
n
ew
c
a
se
s
### b
mass %>% filter(new_cases>=0) %>% ggplot(aes(x=date, y=new_cases)) + geom_line() +
facet_wrap(~county, scales="free") + labs(x="date", y="new cases")
9
Suffolk Unknown Worcester
Middlesex Nantucket Norfolk Plymouth
Essex Franklin Hampden Hampshire
Barnstable Berkshire Bristol Dukes
2020−012020−072021−012021−072022−01 2020−072 21−012 21−072 22−01 2020−072 21−012 21−072 22−01
2020−072 21−012 21−072 22−01 2020−072 21−012 21−072 22−01 2020−072 21−012 21−072 22−01 2020−072 21−012 21−072 22−01
2020−072 21−012 21−072 22−01 2020−072 21−012 21−072 22−01 2020−072 21−012 21−072 22−01 2020−072 21−012 21−072 22−01
2020−072 21−012 21−072 22−01 2020−072 21−012 21−072 22−01 2020−072 21−012 21−072 22−01 2020−072 21−012 21−072 22−01
0
50
100
150
0
300
600
900
0
1000
2000
3000
4000
0
1000
2000
3000
4000
5000
0
2000
4000
6000
0
1000
2000
3000
4000
5000
0
2000
4000
6000
8000
0
250
500
750
1000
0
200
400
600
0
50
100
150
0
1000
2000
3000
4000
5000
0
250
500
750
1000
0
2000
4000
6000
0
5000
10000
0
2500
5000
7500
date
n
ew
c
a
se
s
plot a has uniform y scale so we can compare between counties. plot b shows the trend within each county
more clearly.
5
a
ma_extra %>% ggplot(aes(x=Rural_urban_Continuum_Code_2013)) + geom_bar() +
ggtitle("Rural_urban_Continuum_Code_2013") + labs(x="code",y="#")
10
02
4
6
2 4 6
code
#
Rural_urban_Continuum_Code_2013
ma_extra %>% ggplot(aes(x=Urban_Influence_Code_2013)) + geom_bar() +
ggtitle("Urban_Influence_Code_2013") + labs(x="code",y="#")
11
02
4
6
3 6 9
code
#
Urban_Influence_Code_2013
### b
ma_extra %>% ggplot(aes(x=Rural_urban_Continuum_Code_2013, fill=as.factor(Urban_Influence_Code_2013))) +
geom_bar() + ggtitle("Rural vs Urban") +
labs(x="Rural_urban_Continuum_Code_2013",y="#")
12
02
4
6
2 4 6
Rural_urban_Continuum_Code_2013
#
as.factor(Urban_Influence_Code_2013)
1
2
5
8
11
Rural vs Urban
### d
mass %>% filter(date==max(date)) %>% ggplot(aes(x=county, y=cases)) +
geom_col() + ggtitle("cases by county") +
theme(axis.text.x = element_text(angle = 90))
13
0e+00
1e+05
2e+05
3e+05
Ba
rn
st
ab
le
Be
rk
sh
ire
Br
is
to
l
D
uk
e
s
Es
se
x
Fr
a
n
kl
in
H
am
pd
en
H
am
ps
hi
re
M
id
dl
es
ex
N
an
tu
ck
e
t
N
or
fo
lk
Pl
ym
ou
th
Su
ffo
lk
Un
kn
ow
n
W
o
rc
e
st
er
county
ca
se
s
cases by county
### e Number of cases was counted in mass but not ma_extra.
6
a
ma_extra %>% ggplot(aes(x=Urban_rural, y=people_per_Housing)) + geom_boxplot() +
labs(y="People_per_housing", x="Urban_rural")
14
1.0
1.5
2.0
2.5
rural urban
Urban_rural
Pe
o
pl
e_
pe
r_
ho
us
in
g
### b Boxes in boxplots show 25% 50% 75% percentiles. Whiskers extends to 1.5IQR, with any outliers
shown as points.
15