# Clear environment, plot pane, and console
rm(list = ls())
graphics.off()
cat("\014")
# If pacman is not already installed, then install it
if (!require(pacman)) install.packages("pacman")
# Load packages
pacman::p_load(ggplot2, data.table, sandwich, lmtest, gridExtra, wooldridge)
# Colors used for plot (https://www.rapidtables.com/web/color/RGB_Color.html)
color_1 <- "#00FF00"
color_2 <- "#ADD8E6"
# Set random seed for reproducibility
set.seed(418518)
# Disable scientific notation
options(scipen = 999)
Notice in the above code we loaded the sandwich package. This package gives us heteroskedasticity-consistent estimation results, so we will need it throughout this code. We will also need the lmtest package, as it lets our regression summary be displayed with heteroskedasticity-consistent standard errors. Lastly, the gridExtra package allows plots to be placed side by side or stacked on top of each other.
Let’s visualize what heteroskedasticity looks like. Remember, heteroskedasticity occurs when the variance of the error is individual specific. We’ll simulate data with homoskedastic and heteroskedastic errors and plot each sample along with bands two error standard deviations above and below the true regression line to juxtapose the difference.
# Number of data points to generate per sample
n <- 200
# Intercept term
beta_0 <- 1
# Slope term
beta_1 <- 2
# Regressor
x <- rnorm(n, mean = 5, sd = 2)
# Error
u_hetero <- rnorm(n, mean = 0, sd = abs(1 + 1.5 * x))
u_homo <- rnorm(n, mean = 0, sd = 1)
# Outcome
y_hetero <- beta_0 + beta_1 * x + u_hetero
y_homo <- beta_0 + beta_1 * x + u_homo
# Upper and lower bounds for heteroskedasticity case
y_hetero_upper <- beta_0 + beta_1 * x + 2 * abs(1 + 1.5 * x)
y_hetero_lower <- beta_0 + beta_1 * x - 2 * abs(1 + 1.5 * x)
# Upper and lower bounds for homoskedasticity case
y_homo_upper <- beta_0 + beta_1 * x + 2
y_homo_lower <- beta_0 + beta_1 * x - 2
# Create data table
dt <- data.table(y_hetero, y_hetero_upper, y_hetero_lower, y_homo, y_homo_upper, y_homo_lower, x)
# Plot for heteroskedasticity case
plot_hetero <- ggplot(data = dt, aes(x = x, y = y_hetero)) +
geom_point(color = color_1) +
geom_line(aes(y = y_hetero_upper), color = color_2, size = 2) +
geom_line(aes(y = y_hetero_lower), color = color_2, size = 2) +
labs(title = "Heteroskedasticity Case", x = "X", y = "Y") +
theme(
panel.background = element_rect(fill = "black", color = "black"),
plot.background = element_rect(fill = "black", color = "black"),
panel.grid.major = element_line(color = "gray"),
panel.grid.minor = element_line(color = "gray"),
axis.text = element_text(color = color_1, size = 5),
axis.title = element_text(color = color_2, size = 10),
plot.title = element_text(hjust = 0.5, color = color_2, size = 15, face = "bold")
)
# Plot for homoskedasticity case
plot_homo <- ggplot(data = dt, aes(x = x, y = y_homo)) +
geom_point(color = color_1) +
geom_line(aes(y = y_homo_upper), color = color_2, size = 2) +
geom_line(aes(y = y_homo_lower), color = color_2, size = 2) +
labs(title = "Homoskedasticity Case", x = "X", y = "Y") +
theme(
panel.background = element_rect(fill = "black", color = "black"),
plot.background = element_rect(fill = "black", color = "black"),
panel.grid.major = element_line(color = "gray"),
panel.grid.minor = element_line(color = "gray"),
axis.text = element_text(color = color_1, size = 8),
axis.title = element_text(color = color_2, size = 10),
plot.title = element_text(hjust = 0.5, color = color_2, size = 15, face = "bold")
)
# Combine the plots side by side
grid.arrange(plot_hetero, plot_homo, nrow = 1)
We can see that when the errors are heteroskedastic the residuals seem to get larger as \(x\) becomes larger. This means individuals with larger values of \(x\) have larger residuals, implying the variance of the error term may very well be individual specific. Comparing this to the homoskedasticity case on the right, we see the residuals are fairly constant across the board, indicating the variance of the errors is the same for each individual.
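Beyond eyeballing the plots, we can run a quick formal check. Below is a minimal sketch using the Breusch-Pagan test from the lmtest package we already loaded; it is a supplementary check and not required for the rest of the code.
# Breusch-Pagan test: checks whether the regressors help explain the squared residuals
bptest(lm(y_hetero ~ x, data = dt)) # we expect a small p-value (evidence of heteroskedasticity)
bptest(lm(y_homo ~ x, data = dt))   # we expect a large p-value (no evidence against homoskedasticity)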
sandwich and lmtest Packages for HCSE
R has nice functions within the sandwich and lmtest packages for computing HCSE. Whenever you run a regression from now on, you should be using this functionality so you no longer need to worry about incorrectly estimated standard errors in large samples.
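As a rough sketch of the general workflow (the lm call and your_data below are placeholders, not objects from this document):
# General pattern: fit the model, build a heteroskedasticity-consistent
# var-cov matrix with vcovHC(), and pass it to coeftest() for the summary
model <- lm(y ~ x, data = your_data)                 # your_data is a placeholder
coeftest(model, vcov = vcovHC(model, type = "HC1"))  # other types such as "HC0", "HC2", "HC3" are also available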
Let’s see how we can use these packages with the hprice1 data set from wooldridge. This data set contains housing prices along with a host of covariates.
# Load in hprice1 dataset
dt <- data.table(wooldridge::hprice1)
# Only keep desired columns
dt <- dt[, .(price, lotsize, sqrft, bdrms)]
# Show the first few rows of the data
head(dt)
# Show the last few rows of the data
tail(dt)
Let’s estimate the model \(price_i = \beta_0 + \beta_1 lotsize_i + \beta_2 sqrft_i + \beta_3 bdrms_i + u_i\) without using HCSE like we have been doing.
# Run regression
reg <- lm(price ~ lotsize + sqrft + bdrms, data = dt)
# Show regression summary
summary(reg)
##
## Call:
## lm(formula = price ~ lotsize + sqrft + bdrms, data = dt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -120.026 -38.530 -6.555 32.323 209.376
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -21.7703081 29.4750419 -0.739 0.46221
## lotsize 0.0020677 0.0006421 3.220 0.00182 **
## sqrft 0.1227782 0.0132374 9.275 0.0000000000000166 ***
## bdrms 13.8525217 9.0101454 1.537 0.12795
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 59.83 on 84 degrees of freedom
## Multiple R-squared: 0.6724, Adjusted R-squared: 0.6607
## F-statistic: 57.46 on 3 and 84 DF, p-value: < 0.00000000000000022
Now, we’ll estimate the same model using HC0 heteroskedasticity corrections.
# Obtain HC0 standard errors
HC0 <- vcovHC(reg, type = "HC0")
# Show HC0 standard errors
sqrt(diag(HC0))
## (Intercept) lotsize sqrft bdrms
## 36.284344446 0.001222652 0.017317800 8.283687986
# Show regression summary using HC0 standard errors
coeftest(reg, vcov = HC0)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -21.7703081 36.2843444 -0.6000 0.55013
## lotsize 0.0020677 0.0012227 1.6912 0.09451 .
## sqrft 0.1227782 0.0173178 7.0897 0.0000000003883 ***
## bdrms 13.8525217 8.2836880 1.6723 0.09819 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Notice the stark difference in standard errors and, consequently, t-statistics and p-values. This is particularly notable for the parameter estimate on lot size. Using the inconsistent standard errors, we would reject the null hypothesis of \(\beta_1 = 0\) at the \(\alpha = 0.01\) level, but using the HC0-corrected standard errors, we wouldn’t even be able to reject this null hypothesis at the \(\alpha = 0.05\) level.
We can do the same thing for the HC1 correction. Since the HC1 correction is the HC0 correction scaled by \(\frac{n}{n-k}\), we should expect the HC1 standard errors to be larger than those produced by HC0.
# Obtain HC1 standard errors
HC1 <- vcovHC(reg, type = "HC1")
# Show HC1 standard errors
sqrt(diag(HC1))
## (Intercept) lotsize sqrft bdrms
## 37.138210550 0.001251424 0.017725334 8.478624962
# Show regression summary using HC1 standard errors
coeftest(reg, vcov = HC1)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -21.7703081 37.1382106 -0.5862 0.5593
## lotsize 0.0020677 0.0012514 1.6523 0.1022
## sqrft 0.1227782 0.0177253 6.9267 0.0000000008096 ***
## bdrms 13.8525217 8.4786250 1.6338 0.1060
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Indeed, the HC1 standard errors are larger than the HC0 ones and differ even more from the inconsistent standard errors OLS normally produces.
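As a quick sanity check (not part of the original output), we can reproduce the HC1 standard errors from the HC0 ones by hand, since the HC1 variance matrix is just the HC0 one scaled by \(\frac{n}{n-k}\); here \(k\) counts all four estimated coefficients, including the intercept.
# Rescale the HC0 standard errors by sqrt(n / (n - k)); this should match sqrt(diag(HC1))
n_obs <- nobs(reg)          # 88 observations in hprice1
k_par <- length(coef(reg))  # 4 estimated coefficients (intercept included)
sqrt(n_obs / (n_obs - k_par)) * sqrt(diag(HC0))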
Let us now create a simulation showing that the usual estimator \(\widehat{\sigma}^2 (X^\prime X)^{-1}\) is inconsistent in the case of heteroskedasticity, when \(\mathbb{V}\left[ \widehat{\boldsymbol{\beta}} \right] = (X^\prime X)^{-1} X^\prime \text{diag}\left(\sigma_i^2 \right) X (X^\prime X)^{-1}\). We’ll then show that the HC0 and HC1 corrections are consistent, with HC1 exhibiting somewhat less bias in smaller samples. Recall that the HC0 and HC1 corrections are given by \[\begin{align} \widehat{\mathbb{V}}_{HC0} \left[ \widehat{\boldsymbol{\beta}} \right] &= \left[ (X^\prime X)^{-1} \left(X^\prime \text{diag} \left(\widehat{\sigma}_i^2 \right) X \right) (X^\prime X)^{-1} \right] \\ \widehat{\mathbb{V}}_{HC1} \left[ \widehat{\boldsymbol{\beta}} \right] &= \frac{n}{n-k} \left[ (X^\prime X)^{-1} \left( X^\prime \text{diag} \left(\widehat{\sigma}_i^2 \right) X \right) (X^\prime X)^{-1} \right] \\ &= \frac{n}{n-k} \widehat{\mathbb{V}}_{HC0} \left[ \widehat{\boldsymbol{\beta}} \right] \end{align}\]
# Number of samples
s <- 1000
# Number of data points to generate per sample
n <- c(50, 200, 1000)
# Number of estimated parameters
k <- 3
# Initialize empty list to store standard error of beta_1 depending on the number of observations
estimates_list <- vector("list", length(n))
names(estimates_list) <- paste0("n = ", n)
# True parameters
beta_0 <- 1
beta_1 <- 2
beta_2 <- 3
# We want to create i = 1, 2, ..., s samples
for (i in 1:s)
{
# We want a differing number of observations per sample
for (j in 1:length(n))
{
# Generate data (u is heteroskedastic)
x_1 <- rnorm(n[j], mean = 5, sd = 2)
x_2 <- rnorm(n[j], mean = 0, sd = 1)
u <- rnorm(n[j], mean = 0, sd = abs(1 + 1.5 * x_1))
y <- beta_0 + beta_1 * x_1 + beta_2 * x_2 + u
# Create feature matrix and its transpose
X <- cbind(Intercept = rep(1, n[j]), x_1, x_2)
X_prime <- t(X)
# True var-cov matrix of beta_hat, using the known error variances (1 + 1.5 * x_1)^2 from the DGP
v_true <- solve(X_prime %*% X) %*% (X_prime %*% diag((1 + 1.5 * x_1)^2) %*% X) %*% solve(X_prime %*% X)
# Obtain standard error of beta_1 created by v_true
v_true_beta_1_se <- sqrt(diag(v_true)["x_1"])
# Compute OLS estimator
beta_hat <- solve(X_prime %*% X) %*% X_prime %*% y
# Compute residuals
u_hat <- as.vector(y - X %*% beta_hat)
# Estimator of variance of residuals
var_hat_u_hat <- sum(u_hat^2) / (n[j] - k)
# Inconsistent var-cov matrix estimator
v <- var_hat_u_hat * solve(X_prime %*% X)
# Obtain standard error of beta_1 created by v
v_beta_1_se <- sqrt(diag(v)["x_1"])
# HC0
v_HC0 <- solve(X_prime %*% X) %*% (X_prime %*% diag(u_hat^2) %*% X) %*% solve(X_prime %*% X)
# Obtain standard error of beta_hat_1 created by v_HC0
v_HC0_beta_1_se <- sqrt(diag(v_HC0)["x_1"])
# HC1
v_HC1 <- (n[j] / (n[j] - k)) * v_HC0
# Obtain standard error of beta_hat_1 created by v_HC1
v_HC1_beta_1_se <- sqrt(diag(v_HC1)["x_1"])
# Store estimate of standard error of beta_1 for each var-cov estimator
estimates_list[[j]] <- rbind(estimates_list[[j]], c(v_true_beta_1_se, v_beta_1_se, v_HC0_beta_1_se, v_HC1_beta_1_se))
}
}
# Combine estimates into a single data.table for plotting
dt_plot <- rbindlist(lapply(seq_along(estimates_list), function(i) {
data.table(
Sample_Size = names(estimates_list)[i],
True_SE = estimates_list[[i]][, 1],
OLS_SE = estimates_list[[i]][, 2],
HC0_SE = estimates_list[[i]][, 3],
HC1_SE = estimates_list[[i]][, 4]
)
}))
# Ensure sample size is a factor with levels in the desired order
dt_plot[, Sample_Size := factor(Sample_Size, levels = paste0("n = ", n))]
# Reshape data for plotting
dt_melt <- melt(dt_plot, id.vars = "Sample_Size", variable.name = "Estimator", value.name = "Estimate")
# Calculate the mean of the true standard error of beta_hat_1 by sample size
mean_true_se <- dt_melt[Estimator == "True_SE", .(Mean_True_SE = mean(Estimate)), by = Sample_Size]
# Extract mean true standard error of beta_hat_1 for each sample size
mean_true_se_n50 <- mean_true_se[Sample_Size == "n = 50", Mean_True_SE]
mean_true_se_n200 <- mean_true_se[Sample_Size == "n = 200", Mean_True_SE]
mean_true_se_n1000 <- mean_true_se[Sample_Size == "n = 1000", Mean_True_SE]
# Create a data table for the vertical lines
vline_data <- data.table(
xintercept = c(mean_true_se_n50, mean_true_se_n200, mean_true_se_n1000),
Sample_Size = factor(c("n = 50", "n = 200", "n = 1000"), levels = paste0("n = ", n))
)
# Plot the sampling distributions for inconsistent standard error estimator of beta_hat_1
ggplot(dt_melt[Estimator == "OLS_SE"], aes(x = Estimate, fill = Sample_Size)) +
geom_density(alpha = 0.7, adjust = 1) +
geom_vline(data = vline_data, aes(xintercept = xintercept, color = Sample_Size, linetype = "Mean True SE"), size = 1) +
labs(title = expression(paste("Sampling Distribution for OLS Standard Error of ", hat(beta)[1]))) +
scale_fill_manual(
values = c("#FFA07A", "#20B2AA", "#778899"),
labels = c(paste0("n = ", n[1]), paste0("n = ", n[2]), paste0("n = ", n[3])),
name = "Sample Size"
) +
scale_color_manual(
values = c("n = 50" = "#FFA07A", "n = 200" = "#20B2AA", "n = 1000" = "#778899"),
name = "Sample Size"
) +
scale_linetype_manual(
name = "Vertical Lines",
values = c("Mean True SE" = "dashed"),
labels = c("Mean True SE of " ~ hat(beta)[1])
) +
theme(
panel.background = element_rect(fill = "black", color = "black"),
plot.background = element_rect(fill = "black", color = "black"),
panel.grid.major = element_line(color = "gray"),
panel.grid.minor = element_line(color = "gray"),
axis.text = element_text(color = color_1, size = 10),
axis.title = element_text(color = color_2, size = 15),
plot.title = element_text(hjust = 0.5, color = color_2, size = 15, face = "bold"),
legend.background = element_rect(fill = "black"),
legend.text = element_text(color = color_1, size = 10),
legend.title = element_text(color = color_2, size = 15)
) +
xlab(expression(paste("OLS SE of ", hat(beta)[1]))) +
ylab("Density")
As we can see, even for large values of \(n\), the usual OLS standard error estimator based on \(\widehat{\sigma}^2 (X^\prime X)^{-1}\) underestimates the true standard error of \(\widehat{\beta}_1\), and its sampling distribution is consistently centered around the wrong value. In fact, the sampling distribution gets tighter around that wrong value as \(n \to \infty\). This is evidenced by the sampling distribution of these standard error estimates sitting to the left of the vertical line for each value of \(n\).
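One way to put rough numbers on this (a supplementary check built from the simulation output above) is to compare the average OLS standard error with the average true standard error at each sample size:
# Average true and OLS standard errors of beta_hat_1 by sample size; the ratio
# should stay well below one even at n = 1000, which is what inconsistency looks like
dt_plot[, .(Mean_True_SE = mean(True_SE),
            Mean_OLS_SE  = mean(OLS_SE),
            Ratio        = mean(OLS_SE / True_SE)), by = Sample_Size]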
# Plot the sampling distributions for the HC0 estimator of the standard error of beta_hat_1
ggplot(dt_melt[Estimator == "HC0_SE"], aes(x = Estimate, fill = Sample_Size)) +
geom_density(alpha = 0.7) +
geom_vline(data = vline_data, aes(xintercept = xintercept, color = Sample_Size, linetype = "Mean True SE"), size = 1) +
labs(title = expression(paste("Sample Distribution for HC0 SE of ", hat(beta)[1]))) +
scale_fill_manual(
values = c("#FFA07A", "#20B2AA", "#778899"),
labels = c(paste0("n = ", n[1]), paste0("n = ", n[2]), paste0("n = ", n[3])),
name = "Sample Size"
) +
scale_color_manual(
values = c("n = 50" = "#FFA07A", "n = 200" = "#20B2AA", "n = 1000" = "#778899"),
name = "Sample Size"
) +
scale_linetype_manual(
name = "Vertical Lines",
values = c("Mean True SE" = "dashed"),
labels = c("Mean True SE of " ~ hat(beta)[1])
) +
theme(
panel.background = element_rect(fill = "black", color = "black"),
plot.background = element_rect(fill = "black", color = "black"),
panel.grid.major = element_line(color = "gray"),
panel.grid.minor = element_line(color = "gray"),
axis.text = element_text(color = color_1, size = 10),
axis.title = element_text(color = color_2, size = 15),
plot.title = element_text(hjust = 0.5, color = color_2, size = 15, face = "bold"),
legend.background = element_rect(fill = "black"),
legend.text = element_text(color = color_1, size = 10),
legend.title = element_text(color = color_2, size = 15)
) +
xlab(expression(paste("HC0 SE of ", hat(beta)[1]))) +
ylab("Density")
The sampling distribution of the HC0 estimator gets closer to the truth as \(n\) increases. When \(n = 1000\) this sampling distribution is tightly centered around the truth, as we desire. This shows the HC0 estimator is consistent. However, notice that in smaller samples the HC0 estimator does struggle, underestimating the true standard error. That is, the estimator is biased in finite samples even though it is consistent.
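We can quantify this small-sample struggle with another supplementary check (again not part of the original code): the share of simulated samples in which the HC0 standard error falls below the true one at each sample size.
# Share of samples where HC0 underestimates the true standard error of beta_hat_1;
# shares well above one half point to downward bias in smaller samples
dt_plot[, .(Share_HC0_Below_True = mean(HC0_SE < True_SE)), by = Sample_Size]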
# Plot the sampling distributions for HC1 estimator of standard error of beta_hat_1
ggplot(dt_melt[Estimator == "HC1_SE"], aes(x = Estimate, fill = Sample_Size)) +
geom_density(alpha = 0.7) +
geom_vline(data = vline_data, aes(xintercept = xintercept, color = Sample_Size, linetype = "Mean True SE"), size = 1) +
labs(title = expression(paste("Sample Distribution for HC1 SE of ", hat(beta)[1]))) +
scale_fill_manual(
values = c("#FFA07A", "#20B2AA", "#778899"),
labels = c(paste0("n = ", n[1]), paste0("n = ", n[2]), paste0("n = ", n[3])),
name = "Sample Size"
) +
scale_color_manual(
values = c("n = 50" = "#FFA07A", "n = 200" = "#20B2AA", "n = 1000" = "#778899"),
name = "Sample Size"
) +
scale_linetype_manual(
name = "Vertical Lines",
values = c("Mean True SE" = "dashed"),
labels = c("Mean True SE of " ~ hat(beta)[1])
) +
theme(
panel.background = element_rect(fill = "black", color = "black"),
plot.background = element_rect(fill = "black", color = "black"),
panel.grid.major = element_line(color = "gray"),
panel.grid.minor = element_line(color = "gray"),
axis.text = element_text(color = color_1, size = 10),
axis.title = element_text(color = color_2, size = 15),
plot.title = element_text(hjust = 0.5, color = color_2, size = 15, face = "bold"),
legend.background = element_rect(fill = "black"),
legend.text = element_text(color = color_1, size = 10),
legend.title = element_text(color = color_2, size = 15)
) +
xlab(expression(paste("HC1 SE of ", hat(beta)[1]))) +
ylab("Density")
The sampling distribution of the HC1 estimator gets closer to the truth as \(n\) increases. When \(n = 1000\) this sampling distribution is tightly centered around the truth, as we desire. This shows the HC1 estimator is consistent. However, in smaller samples the HC1 estimator also underestimates the true standard error, though to a slightly lesser extent than HC0. That is, HC1 is biased in finite samples like HC0, but its finite-sample bias is smaller; both estimators are consistent.
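To back this up numerically (one more supplementary check using the simulation output), we can compare the average ratio of each corrected standard error to the true one; ratios below one indicate underestimation, and the HC1 ratio should sit slightly closer to one than the HC0 ratio in the smaller samples.
# Average ratio of each corrected SE to the true SE of beta_hat_1 by sample size
dt_plot[, .(HC0_to_True = mean(HC0_SE / True_SE),
            HC1_to_True = mean(HC1_SE / True_SE)), by = Sample_Size]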