# Clear environment, plot pane, and console
rm(list = ls())
graphics.off()
cat("\014")
# Set working directory
setwd("C:/Users/wbras/OneDrive/Desktop/UA/Fall_2024/ECON_418-518/ECON_418-518_Lecture_Slides/ECON_418-518_Lecture_Slides_E/ECON_418-518_E_09_MLR_Issues/ECON_418-518_E_09_MLR_Issues_Code")
# Install pacman
if (!require(pacman)) install.packages("pacman")
# Load packages
pacman::p_load(data.table, ggplot2, gridExtra)
# Colors used for plot (https://www.rapidtables.com/web/color/RGB_Color.html)
color_1 <- "#00FF00"
color_2 <- "#ADD8E6"
# Set seed for reproducibility
set.seed(418518)
# Disable scientific notation
options(scipen = 999)
We will construct simulations to show what happens to our estimators when the functional form of the model is misspecified.
Suppose the true underlying model is \(y_i = \beta_0 + u_i = 1 + u_i\), but we estimate \(y_i = \beta_0 + \beta_1 x_{i1} + u_i\). Since this is an example of model overspecification, the estimators of \(\beta_0\) and \(\beta_1\) will remain unbiased, but the estimator of \(\beta_0\) will have a higher variance relative to not including the \(x_1\) covariate. Let’s verify this by generating \(s = 1000\) samples each with \(n = 200\) data points, running the regression of \(y\) on only an intercept for each sample, and running the regression of \(y\) on a constant and \(x_1\) for each sample. If we are correct, then the sampling distribution of \(\widehat{\beta}_0\) should be centered at \(\beta_0 = 1\) with higher variance when \(x_1\) is included, and the sampling distribution of \(\widehat{\beta}_1\) should be centered at \(\beta_1 = 0\).
# Number of samples
s <- 1000
# Number of data points to generate per sample
n <- 200
# Create a data table with empty values to store the parameter estimates
dt_estimates <- data.table(beta_hat_0_without_x_1 = rep(NA, s),
                           beta_hat_0_with_x_1 = rep(NA, s),
                           beta_hat_1 = rep(NA, s))
# Intercept term when y is the outcome
beta_0 <- 1
# Coefficient on x_1 when y is the outcome
beta_1 <- 0
# We want to create i = 1, 2, ..., s samples
for (i in 1:s)
{
# Error term when y is the outcome
u <- rnorm(n, mean = 0, sd = 2)
# x_1
x_1 <- rnorm(n, mean = 5, sd = 10)
# Outcome
y <- beta_0 + beta_1 * x_1 + u
# Create a data table
dt <- data.table(y = y, x_1 = x_1)
# Run regression of y on intercept only
reg_without_x_1 <- lm(y ~ 1, data = dt)
# Run regression of y on x_1
reg_with_x_1 <- lm(y ~ x_1, data = dt)
# Place the estimates in dt_estimates
dt_estimates$beta_hat_0_without_x_1[i] <- reg_without_x_1$coefficients["(Intercept)"]
dt_estimates$beta_hat_0_with_x_1[i] <- reg_with_x_1$coefficients["(Intercept)"]
dt_estimates$beta_hat_1[i] <- reg_with_x_1$coefficients["x_1"]
}
# Plot the density of the slope estimate
ggplot(dt_estimates, aes(x = beta_hat_1)) +
geom_density(color = color_2, fill = color_2, alpha = 0.7) +
geom_vline(xintercept = beta_1, color = color_1, linetype = "dashed", size = 1) +
labs(title = expression(paste("Sampling Distribution of ", hat(beta)[1]))) +
theme(
panel.background = element_rect(fill = "black", color = "black"),
plot.background = element_rect(fill = "black", color = "black"),
panel.grid.major = element_line(color = "gray"),
panel.grid.minor = element_line(color = "gray"),
axis.text = element_text(color = color_1, size = 15),
axis.title = element_text(color = color_2, size = 25),
plot.title = element_text(hjust = 0.5, color = color_2, size = 30, face = "bold")
) +
xlab(expression(hat(beta)[1])) +
ylab("Density")
We can see above that the sampling distribution of \(\widehat{\beta}_1\) is centered at \(\beta_1 = 0\), as we would expect. Now, let’s investigate the sampling distribution of \(\widehat{\beta}_0\).
# Get the min and max values for both beta_hat_0_without_x_1 and beta_hat_0_with_x_1
x_min <- min(dt_estimates$beta_hat_0_without_x_1, dt_estimates$beta_hat_0_with_x_1)
x_max <- max(dt_estimates$beta_hat_0_without_x_1, dt_estimates$beta_hat_0_with_x_1)
# Plot the density of the intercept estimate when x_1 is excluded
beta_hat_0_without_x_1 <- ggplot(dt_estimates, aes(x = beta_hat_0_without_x_1)) +
geom_density(color = color_2, fill = color_2, alpha = 0.7) +
geom_vline(xintercept = beta_0, color = color_1, linetype = "dashed", size = 1) +
labs(title = expression(paste("Sampling Distribution of ", hat(beta)[0], " w/o x_1"))) +
theme(
panel.background = element_rect(fill = "black", color = "black"),
plot.background = element_rect(fill = "black", color = "black"),
panel.grid.major = element_line(color = "gray"),
panel.grid.minor = element_line(color = "gray"),
axis.text = element_text(color = color_1, size = 8),
axis.title = element_text(color = color_2, size = 10),
plot.title = element_text(hjust = 0.5, color = color_2, size = 15, face = "bold")
) +
xlab(expression(hat(beta)[0])) +
ylab("Density") +
xlim(x_min, x_max) # Set the same x-axis limits
# Plot the density of the intercept estimate when x_1 is included
beta_hat_0_with_x_1 <- ggplot(dt_estimates, aes(x = beta_hat_0_with_x_1)) +
geom_density(color = color_2, fill = color_2, alpha = 0.7) +
geom_vline(xintercept = beta_0, color = color_1, linetype = "dashed", size = 1) +
labs(title = expression(paste("Sampling Distribution of ", hat(beta)[0], " w/ x_1"))) +
theme(
panel.background = element_rect(fill = "black", color = "black"),
plot.background = element_rect(fill = "black", color = "black"),
panel.grid.major = element_line(color = "gray"),
panel.grid.minor = element_line(color = "gray"),
axis.text = element_text(color = color_1, size = 8),
axis.title = element_text(color = color_2, size = 10),
plot.title = element_text(hjust = 0.5, color = color_2, size = 15, face = "bold")
) +
xlab(expression(hat(beta)[0])) +
ylab("Density") +
xlim(x_min, x_max) # Set the same x-axis limits
# Arrange plots in a single figure
grid.arrange(beta_hat_0_without_x_1, beta_hat_0_with_x_1, nrow = 1)
Whether we include or exclude \(x_1\), the sampling distribution of \(\widehat{\beta}_0\) is centered around the true value \(\beta_0 = 1\), implying we get an unbiased estimator in either case. However, when we include \(x_1\), the distribution has slightly larger variance, meaning the standard error should be larger. Let’s run the regressions on the last simulated sample to confirm this because it’s somewhat difficult to tell from the above image.
# Summary of regression of y on intercept and x_1
summary(lm(y ~ x_1, data = dt))
##
## Call:
## lm(formula = y ~ x_1, data = dt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3391 -1.4510 -0.1535 1.4105 4.8344
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.84906 0.14954 5.678 0.0000000481 ***
## x_1 0.02133 0.01370 1.558 0.121
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.975 on 198 degrees of freedom
## Multiple R-squared: 0.0121, Adjusted R-squared: 0.007114
## F-statistic: 2.426 on 1 and 198 DF, p-value: 0.1209
# Summary of regression of y on intercept
summary(lm(y ~ 1, data = dt))
##
## Call:
## lm(formula = y ~ 1, data = dt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.5387 -1.4902 -0.0941 1.3045 4.7851
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.9323 0.1402 6.651 0.000000000275 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.982 on 199 degrees of freedom
Indeed, the standard error of \(\widehat{\beta}_0\) is larger when \(x_1\) is included. Notice also that the intercept term is statistically significant while the slope term is not, as we would expect given the true underlying model \(y_i = \beta_0 + u_i\).
In sum, adding unnecessary covariates does not bias our estimators, but it inflates their variance and, consequently, their standard errors, making reliable statistical inference more difficult.
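We can also quantify the variance inflation directly by comparing the standard deviations of the stored intercept estimates across the two specifications (a quick check using the dt_estimates table built above; the exact values depend on the seed, but the second should be slightly larger):
# Simulated standard deviation of beta_hat_0 when x_1 is excluded
sd(dt_estimates$beta_hat_0_without_x_1)
# Simulated standard deviation of beta_hat_0 when x_1 is included
sd(dt_estimates$beta_hat_0_with_x_1)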
Suppose the true underlying model is \(y_i = \beta_0 + x_{i1}^{\beta_1} + u_i = 1 + x_{i1}^2 + u_i\), but we estimate the linear model \(y_i = \beta_0 + \beta_1 x_{i1} + u_i\). Since this is an example of model misspecification, the estimators of \(\beta_0\) and \(\beta_1\) will both be biased. Let’s verify this by generating \(s = 1000\) samples each with \(n = 200\) data points and running the regression of \(y\) on an intercept term and the \(x_1\) covariate for each sample. If we are correct, then the sampling distribution of \(\widehat{\beta}_0\) should not be centered at \(\beta_0 = 1\) and the sampling distribution of \(\widehat{\beta}_1\) should not be centered at \(\beta_1 = 2\).
# Number of samples
s <- 1000
# Number of data points to generate per sample
n <- 200
# Create a data table with empty values to store the parameter estimates
dt_estimates <- data.table(beta_hat_0 = rep(NA, s),
                           beta_hat_1 = rep(NA, s))
# Intercept term when y is the outcome
beta_0 <- 1
# Exponent on x_1 when y is the outcome
beta_1 <- 2
# We want to create i = 1, 2, ..., s samples
for (i in 1:s)
{
# Error term when y is the outcome
u <- rnorm(n, mean = 0, sd = 2)
# x_1
x_1 <- rnorm(n, mean = 3, sd = 2)
# Outcome
y <- beta_0 + x_1^beta_1 + u
# Create a data table
dt <- data.table(y = y, x_1 = x_1)
# Run regression of y on intercept and x_1
reg <- lm(y ~ x_1, data = dt)
# Place the estimates in dt_estimates
dt_estimates$beta_hat_0[i] <- reg$coefficients["(Intercept)"]
dt_estimates$beta_hat_1[i] <- reg$coefficients["x_1"]
}
# Plot the density of the intercept estimate
plot_beta_hat_0 <- ggplot(dt_estimates, aes(x = beta_hat_0)) +
geom_density(color = color_2, fill = color_2, alpha = 0.7) +
geom_vline(xintercept = beta_0, color = color_1, linetype = "dashed", size = 1) +
labs(title = expression(paste("Sampling Distribution of ", hat(beta)[0]))) +
theme(
panel.background = element_rect(fill = "black", color = "black"),
plot.background = element_rect(fill = "black", color = "black"),
panel.grid.major = element_line(color = "gray"),
panel.grid.minor = element_line(color = "gray"),
axis.text = element_text(color = color_1, size = 8),
axis.title = element_text(color = color_2, size = 10),
plot.title = element_text(hjust = 0.5, color = color_2, size = 15, face = "bold")
) +
xlab(expression(hat(beta)[0])) +
ylab("Density")
# Plot the density of the slope estimate
plot_beta_hat_1 <- ggplot(dt_estimates, aes(x = beta_hat_1)) +
geom_density(color = color_2, fill = color_2, alpha = 0.7) +
geom_vline(xintercept = beta_1, color = color_1, linetype = "dashed", size = 1) +
labs(title = expression(paste("Sampling Distribution of ", hat(beta)[1]))) +
theme(
panel.background = element_rect(fill = "black", color = "black"),
plot.background = element_rect(fill = "black", color = "black"),
panel.grid.major = element_line(color = "gray"),
panel.grid.minor = element_line(color = "gray"),
axis.text = element_text(color = color_1, size = 8),
axis.title = element_text(color = color_2, size = 10),
plot.title = element_text(hjust = 0.5, color = color_2, size = 15, face = "bold")
) +
xlab(expression(hat(beta)[1])) +
ylab("Density")
# Arrange plots in a single figure
grid.arrange(plot_beta_hat_0, plot_beta_hat_1, nrow = 1)
Indeed, the estimators \(\widehat{\beta}_0\) and \(\widehat{\beta}_1\) are biased. Let’s run a regression on a single sample and get the summary output to show this as well.
# Intercept term when y is the outcome
beta_0 <- 1
# Exponent on x_1 when y is the outcome
beta_1 <- 2
# Error term when y is the outcome
u <- rnorm(n, mean = 0, sd = 2)
# x_1
x_1 <- rnorm(n, mean = 3, sd = 2)
# Outcome
y <- beta_0 + x_1^beta_1 + u
# Create a data table
dt <- data.table(y = y, x_1 = x_1)
# Run regression of y on intercept and x_1
reg <- lm(y ~ x_1, data = dt)
# Summary of regression
summary(reg)
##
## Call:
## lm(formula = y ~ x_1, data = dt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.295 -4.543 -2.034 2.077 49.092
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.4568 0.9172 -2.679 0.00801 **
## x_1 5.7413 0.2468 23.267 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.755 on 198 degrees of freedom
## Multiple R-squared: 0.7322, Adjusted R-squared: 0.7308
## F-statistic: 541.3 on 1 and 198 DF, p-value: < 0.00000000000000022
The regression produces \(\widehat{\beta}_0 = -2.457\) and \(\widehat{\beta}_1 = 5.741\), which are nowhere near the parameters they are estimating, \(\beta_0 = 1\) and \(\beta_1 = 2\).
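As a quick sanity check, we could estimate the correctly specified model on this same sample: regressing \(y\) on \(x_{i1}^2\) should give an intercept close to \(\beta_0 = 1\) and a coefficient close to one on the squared term.
# Regress y on the correctly specified quadratic term; I() protects the
# transformation inside the formula
summary(lm(y ~ I(x_1^2), data = dt))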
Suppose the true underlying model is \(y_i = \beta_0 + x_{i1}^{\beta_1} + u_i = 1 + x_{i1}^{2.5} + u_i\), but we estimate the linear model \(y_i = \beta_0 + \beta_1 x_{i1} + u_i\). Since this is an example of model misspecification, we can test for it via a RESET (Regression Specification Error Test). Remember, to perform a RESET Test we:
1. Run the restricted regression of \(y\) on an intercept and the original covariates and obtain the fitted values \(\widehat{y}_i\).
2. Run the unrestricted regression of \(y\) on an intercept, the original covariates, \(\widehat{y}_i^2\), and \(\widehat{y}_i^3\).
3. Carry out an F-test of the null hypothesis that the coefficients on \(\widehat{y}_i^2\) and \(\widehat{y}_i^3\) are jointly zero.
Rejection of the null hypothesis implies a model misspecification issue, and the F-statistic has two numerator degrees of freedom and \(n - k - 3\) denominator degrees of freedom, where \(k\) is the number of covariates in the original model.
Let’s generate some data, estimate the incorrect model, and carry out the RESET Test at the \(\alpha = 0.05\) level. We’ll take advantage of R’s helpful anova() function to carry out the F-test for us.
# Intercept term when y is the outcome
beta_0 <- 1
# Exponent on x_1 when y is the outcome
beta_1 <- 2.5
# Error term when y is the outcome
u <- rnorm(n, mean = 0, sd = 2)
# x_1
x_1 <- rnorm(n, mean = 2, sd = 2)
# Outcome
y <- beta_0 + x_1^beta_1 + u
# Create a data table
dt <- data.table(y = y, x_1 = x_1)
# Run the restricted regression of y on intercept and x_1
reg_r <- lm(y ~ x_1, data = dt)
# Get fitted values from the restricted regression
y_hat <- fitted(reg_r)
# Add squared and cubed fitted values to dt
dt[, `:=` (y_hat_sq = y_hat^2, y_hat_cube = y_hat^3)]
# Run unrestricted regression of y on intercept, x_1, y_hat_sq, and y_hat_cube
reg_ur <- lm(y ~ x_1 + y_hat_sq + y_hat_cube, data = dt)
# Perform F-test
anova(reg_r, reg_ur)
Notice the p-value in the second row of the above table is \(0.03 < \alpha = 0.05\). Hence, we reject the null hypothesis and conclude we have a misspecification issue. This is expected given the underlying data generating process (DGP) of \(y_i = \beta_0 + x_{i1}^{\beta_1} + u_i\).
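For reference, anova() is equivalent to computing the F-statistic by hand from the two models’ sums of squared residuals (a quick sketch using the reg_r and reg_ur objects above, with \(q = 2\) restrictions):
# Sums of squared residuals from the restricted and unrestricted models
ssr_r <- sum(resid(reg_r)^2)
ssr_ur <- sum(resid(reg_ur)^2)
# Number of restrictions (the squared and cubed fitted value terms)
q <- 2
# Residual degrees of freedom of the unrestricted model
df_ur <- df.residual(reg_ur)
# F-statistic and its p-value
f_stat <- ((ssr_r - ssr_ur) / q) / (ssr_ur / df_ur)
pf(f_stat, df1 = q, df2 = df_ur, lower.tail = FALSE)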
We discussed scenarios under which we have an omitted variable bias (OVB) issue. Let’s construct two situations with OVB to understand this better.
The true underlying model is \(y_i = \beta_0 + \beta_1 x_{i1} + \beta_2 x_{i2} + u_i = 1 + 2 x_{i1} + 3 x_{i2} + u_i\), but suppose we don’t include \(x_2\) (perhaps we don’t have data on it, which is often an issue when doing empirical work). We know \(x_2\) is positively related to \(y\) as indicated by its positive coefficient in the model \(\left(\beta_2 = 3 \right)\). Moreover, suppose \(x_{i1} = \widetilde{\beta}_0 + \widetilde{\beta}_1 x_{i2} + v_i = 4 + 5 x_{i2} + v_i\), so \(x_{2}\) is also positively related to \(x_1\) as indicated by its positive coefficient in the model \(\left(\widetilde{\beta}_1 = 5 \right)\). Consequently, \(x_2\) is positively related to both \(y\) and \(x_1\), meaning that when we estimate the model \(y_i = \beta_0 + \beta_1 x_{i1} + w_i\), \(\beta_1\) should be overestimated (think positive times positive gives positive bias). Let’s verify this by generating \(s = 1000\) samples each with \(n = 200\) data points, running the regression of \(y\) on \(x_1\) for each sample, and plotting the sampling distribution of \(\widehat{\beta}_1\). If we are correct, then the sampling distribution should be centered around some value to the right of \(\beta_1 = 2\) because \(\widehat{\beta}_1\) is overestimating this parameter.
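Before simulating, we can compute where the sampling distribution should be centered. The omitted variable bias formula gives \(\text{plim} \, \widehat{\beta}_1 = \beta_1 + \beta_2 \cdot \text{Cov}(x_1, x_2) / \text{Var}(x_1)\). Plugging in the DGP values used below (a back-of-the-envelope check specific to this DGP):
# Cov(x_1, x_2) = beta_tilde_1 * Var(x_2) = 5 * 1 = 5
# Var(x_1) = beta_tilde_1^2 * Var(x_2) + Var(v) = 5^2 * 1 + 3^2 = 34
# plim(beta_hat_1) = beta_1 + beta_2 * Cov(x_1, x_2) / Var(x_1)
2 + 3 * 5 / 34
This is roughly 2.44, so the density below should peak near that value.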
# Number of samples
s <- 1000
# Number of data points to generate per sample
n <- 200
# Create a data table with empty values to store the estimates of beta_hat_1
dt_estimates <- data.table(beta_hat_1 = rep(NA, s))
# Intercept term when y is the outcome
beta_0 <- 1
# Coefficient on x_1 when y is the outcome
beta_1 <- 2
# Coefficient on x_2 when y is the outcome
beta_2 <- 3
# Intercept term when x_1 is the outcome
beta_tilde_0 <- 4
# Coefficient on x_2 when x_1 is the outcome
beta_tilde_1 <- 5
# We want to create i = 1, 2, ..., s samples
for (i in 1:s)
{
# Error term when y is the outcome
u <- rnorm(n, mean = 0, sd = 2)
# Error term when x_1 is the outcome
v <- rnorm(n, mean = 0, sd = 3)
# x_2
x_2 <- rnorm(n, mean = 0, sd = 1)
# x_1
x_1 <- beta_tilde_0 + beta_tilde_1 * x_2 + v
# Outcome
y <- beta_0 + beta_1 * x_1 + beta_2 * x_2 + u
# Create a data table
dt <- data.table(y = y, x_1 = x_1, x_2 = x_2)
# Run regression of y on x_1
reg <- lm(y ~ x_1, data = dt)
# Place the current slope estimate in dt_estimates
dt_estimates$beta_hat_1[i] <- reg$coefficients["x_1"]
}
# Plot the density of the slope estimates
ggplot(dt_estimates, aes(x = beta_hat_1)) +
geom_density(color = color_2, fill = color_2, alpha = 0.7) +
geom_vline(xintercept = beta_1, color = color_1, linetype = "dashed", size = 1) +
labs(title = expression(paste("Sampling Distribution of ", hat(beta)[1]))) +
theme(
panel.background = element_rect(fill = "black", color = "black"),
plot.background = element_rect(fill = "black", color = "black"),
panel.grid.major = element_line(color = "gray"),
panel.grid.minor = element_line(color = "gray"),
axis.text = element_text(color = color_1, size = 15),
axis.title = element_text(color = color_2, size = 25),
plot.title = element_text(hjust = 0.5, color = color_2, size = 30, face = "bold")
) +
xlab(expression(hat(beta)[1])) +
ylab("Density")
Indeed, the sampling distribution of \(\widehat{\beta}_1\) is centered to the right of the parameter it is estimating (\(\beta_1 = 2\)), consistent with the calculation above, implying the estimator is biased upward.
The true underlying model is \(y_i = \beta_0 + \beta_1 x_{i1} + \beta_2 x_{i2} + u_i = 1 + 2 x_{i1} + 3 x_{i2} + u_i\), but suppose we don’t include \(x_2\) (perhaps we don’t have data on it, which is often an issue when doing empirical work). We know \(x_2\) is positively related to \(y\) as indicated by its positive coefficient in the model \(\left(\beta_2 = 3 \right)\). Moreover, suppose \(x_{i1} = \widetilde{\beta}_0 + \widetilde{\beta}_1 x_{i2} + v_i = 4 - 5 x_{i2} + v_i\), so \(x_{2}\) is negatively related to \(x_1\) as indicated by its negative coefficient in the model \(\left(\widetilde{\beta}_1 = -5 \right)\). Consequently, \(x_2\) is positively related to \(y\) and inversely related to \(x_1\), meaning that when we estimate the model \(y_i = \beta_0 + \beta_1 x_{i1} + w_i\), \(\beta_1\) should be underestimated (think positive times negative gives negative bias). Let’s verify this by generating \(s = 1000\) samples each with \(n = 200\) data points, running the regression of \(y\) on \(x_1\) for each sample, and plotting the sampling distribution of \(\widehat{\beta}_1\). If we are correct, then the sampling distribution should be centered around some value to the left of \(\beta_1 = 2\) because \(\widehat{\beta}_1\) is underestimating this parameter.
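The same omitted variable bias formula pins down the center of the sampling distribution here as well: now \(\text{Cov}(x_1, x_2) = -5\) while \(\text{Var}(x_1) = 34\) as before (again a check specific to this DGP):
# plim(beta_hat_1) = beta_1 + beta_2 * Cov(x_1, x_2) / Var(x_1)
2 + 3 * (-5) / 34
This is roughly 1.56, so the density below should peak near that value.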
# Number of samples
s <- 1000
# Number of data points to generate per sample
n <- 200
# Create a data table with empty values to store the estimates of beta_hat_1
dt_estimates <- data.table(beta_hat_1 = rep(NA, s))
# Intercept term when y is the outcome
beta_0 <- 1
# Coefficient on x_1 when y is the outcome
beta_1 <- 2
# Coefficient on x_2 when y is the outcome
beta_2 <- 3
# Intercept term when x_1 is the outcome
beta_tilde_0 <- 4
# Coefficient on x_2 when x_1 is the outcome
beta_tilde_1 <- 5
# We want to create i = 1, 2, ..., s samples
for (i in 1:s)
{
# Error term when y is the outcome
u <- rnorm(n, mean = 0, sd = 2)
# Error term when x_1 is the outcome
v <- rnorm(n, mean = 0, sd = 3)
# x_2
x_2 <- rnorm(n, mean = 0, sd = 1)
# x_1
x_1 <- beta_tilde_0 - beta_tilde_1 * x_2 + v
# Outcome
y <- beta_0 + beta_1 * x_1 + beta_2 * x_2 + u
# Create a data table
dt <- data.table(y = y, x_1 = x_1, x_2 = x_2)
# Run regression of y on x_1
reg <- lm(y ~ x_1, data = dt)
# Place the current slope estimate in dt_estimates
dt_estimates$beta_hat_1[i] <- reg$coefficients["x_1"]
}
# Plot the density of the slope estimates
ggplot(dt_estimates, aes(x = beta_hat_1)) +
geom_density(color = color_2, fill = color_2, alpha = 0.7) +
geom_vline(xintercept = beta_1, color = color_1, linetype = "dashed", size = 1) +
labs(title = expression(paste("Sampling Distribution of ", hat(beta)[1]))) +
theme(
panel.background = element_rect(fill = "black", color = "black"),
plot.background = element_rect(fill = "black", color = "black"),
panel.grid.major = element_line(color = "gray"),
panel.grid.minor = element_line(color = "gray"),
axis.text = element_text(color = color_1, size = 15),
axis.title = element_text(color = color_2, size = 25),
plot.title = element_text(hjust = 0.5, color = color_2, size = 30, face = "bold")
) +
xlab(expression(hat(beta)[1])) +
ylab("Density")
Indeed, the sampling distribution of \(\widehat{\beta}_1\) is centered to the left of the parameter it is estimating (\(\beta_1 = 2\)), consistent with the calculation above, implying the estimator is biased downward.
Let’s construct a scenario where the outcome is measured with error. The true model will be \(y_i = \beta_0 + \beta_1 x_{i1} + \beta_2 x_{i2} + u_i = 0.5 + x_{i1} + 2 x_{i2} + u_i\), but we observe \(y_i^* = y_i + e_i\) where \(e_i\) is the measurement error associated with agent \(i\). Thus, we estimate \(y_i^* = \beta_0 + \beta_1 x_{i1} + \beta_2 x_{i2} + u_i + e_i\). Assuming the measurement error is not correlated with the covariates, we know the estimators of \(\boldsymbol{\beta}\) will be unbiased, but will have a higher variance relative to the case of no measurement error. Let’s verify this by generating \(s = 1000\) samples each with \(n = 200\) data points, running the regression of \(y\) on \(x_1\) and \(x_2\) for each sample, running the regression of \(y^*\) on \(x_1\) and \(x_2\) for each sample, and plotting the sampling distribution of \(\widehat{\beta}_1\). If we are correct, then both sampling distributions should be centered around the true parameter \(\beta_1 = 1\), but the variance should be larger when the mismeasured outcome is regressed on the covariates.
# Number of samples
s <- 1000
# Number of data points to generate per sample
n <- 200
# Create a data table with empty values to store the estimates of beta_hat_1
dt_estimates <- data.table(beta_hat_1_without_e = rep(NA, s),
                           beta_hat_1_with_e = rep(NA, s))
# Intercept term when y is the outcome
beta_0 <- 0.5
# Coefficient on x_1 when y is the outcome
beta_1 <- 1
# Coefficient on x_2 when y is the outcome
beta_2 <- 2
# We want to create i = 1, 2, ..., s samples
for (i in 1:s)
{
# Error term when y is the outcome
u <- rnorm(n, mean = 0, sd = 2)
# Measurement error
e <- rnorm(n, mean = 0, sd = 3)
# x_1
x_1 <- rnorm(n, mean = 0, sd = 1)
# x_2
x_2 <- rnorm(n, mean = 0, sd = 2)
# Outcome
y <- beta_0 + beta_1 * x_1 + beta_2 * x_2 + u
# Outcome mismeasured
y_star <- y + e
# Create a data table of y without measurement error
dt_without_e <- data.table(y = y, x_1 = x_1, x_2 = x_2)
# Run regression of y on x_1 and x_2
reg_without_e <- lm(y ~ x_1 + x_2, data = dt_without_e)
# Place the current slope estimate of beta_1 in dt_estimates
dt_estimates$beta_hat_1_without_e[i] <- reg_without_e$coefficients["x_1"]
# Create a data table of y with measurement error
dt_with_e <- data.table(y_star = y_star, x_1 = x_1, x_2 = x_2)
# Run regression of y_star on x_1 and x_2
reg_with_e <- lm(y_star ~ x_1 + x_2, data = dt_with_e)
# Place the current slope estimate of beta_1 in dt_estimates
dt_estimates$beta_hat_1_with_e[i] <- reg_with_e$coefficients["x_1"]
}
# Plot the density of the slope estimate on x_1 when y is measured correctly
plot_beta_hat_1_without_e <- ggplot(dt_estimates, aes(x = beta_hat_1_without_e)) +
geom_density(color = color_2, fill = color_2, alpha = 0.7) +
geom_vline(xintercept = beta_1, color = color_1, linetype = "dashed", size = 1) +
labs(title = expression(paste("No ME in Outcome"))) +
theme(
panel.background = element_rect(fill = "black", color = "black"),
plot.background = element_rect(fill = "black", color = "black"),
panel.grid.major = element_line(color = "gray"),
panel.grid.minor = element_line(color = "gray"),
axis.text = element_text(color = color_1, size = 8),
axis.title = element_text(color = color_2, size = 10),
plot.title = element_text(hjust = 0.5, color = color_2, size = 15, face = "bold")
) +
xlab(expression(hat(beta)[1])) +
xlim(range(dt_estimates$beta_hat_1_without_e, dt_estimates$beta_hat_1_with_e)) +
ylab("Density")
# Plot the density of the slope estimate on x_1 when y is measured incorrectly
plot_beta_hat_1_with_e <- ggplot(dt_estimates, aes(x = beta_hat_1_with_e)) +
geom_density(color = color_2, fill = color_2, alpha = 0.7) +
geom_vline(xintercept = beta_1, color = color_1, linetype = "dashed", size = 1) +
labs(title = expression(paste("ME in Outcome"))) +
theme(
panel.background = element_rect(fill = "black", color = "black"),
plot.background = element_rect(fill = "black", color = "black"),
panel.grid.major = element_line(color = "gray"),
panel.grid.minor = element_line(color = "gray"),
axis.text = element_text(color = color_1, size = 8),
axis.title = element_text(color = color_2, size = 10),
plot.title = element_text(hjust = 0.5, color = color_2, size = 15, face = "bold")
) +
xlab(expression(hat(beta)[1])) +
xlim(range(dt_estimates$beta_hat_1_without_e, dt_estimates$beta_hat_1_with_e)) +
ylab("Density")
# Arrange plots in a single figure
grid.arrange(plot_beta_hat_1_without_e, plot_beta_hat_1_with_e, nrow = 1)
As we can see, our estimator looks unbiased, as we would expect. However, when there is measurement error (ME) in the outcome (right panel), the sampling distribution of our estimator is more spread out, indicating the variance of the estimator is higher. This also holds for each of the other estimators, not only \(\widehat{\beta}_1\).
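Again, we can quantify the extra spread by comparing the simulated standard deviations of the two columns of dt_estimates (exact values depend on the seed; the second should be noticeably larger):
# Simulated standard deviation of beta_hat_1 without measurement error in y
sd(dt_estimates$beta_hat_1_without_e)
# Simulated standard deviation of beta_hat_1 with measurement error in y
sd(dt_estimates$beta_hat_1_with_e)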
Let’s construct a scenario where the \(x_1\) regressor is measured with error. The true model will be \(y_i = \beta_0 + \beta_1 x_{i1} + \beta_2 x_{i2} + u_i = 0.5 + x_{i1} + 2 x_{i2} + u_i\), but we observe \(x_{i1}^* = x_{i1} + e_i\) where \(e_i\) is the measurement error associated with agent \(i\). This implies \(x_{i1} = x_{i1}^* - e_i\). Thus, we estimate \(y_i = \beta_0 + \beta_1 \left( x_{i1}^* - e_i \right) + \beta_2 x_{i2} + u_i\). This will lead to a biased estimator of \(\beta_1\). We’ll also increase the standard deviation of \(e_i\) from 0.5 to 2 to show that this higher variance leads to greater attenuation bias (the downward bias increases as the variance of the measurement error increases relative to the variance of \(x_1\)). Let’s verify this by generating \(s = 1000\) samples each with \(n = 200\) data points, running the regression of \(y\) on \(x_{i1}^*\) and \(x_2\) for each sample, and plotting the sampling distribution of \(\widehat{\beta}_1\). If we are correct, then the sampling distribution should not be centered around the true parameter \(\beta_1 = 1\), and as the variance of the measurement error increases our estimator will be attenuated towards zero.
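In fact, with classical measurement error that is independent of everything else (as in the DGP below, where \(x_2\) is also generated independently of \(x_1\)), the attenuation factor is \(\text{Var}(x_1) / \left( \text{Var}(x_1) + \text{Var}(e) \right)\), so we can calculate where each sampling distribution should be centered:
# plim(beta_hat_1) = beta_1 * Var(x_1) / (Var(x_1) + Var(e))
# Low variance measurement error
1 * 1 / (1 + 0.5^2)
# High variance measurement error
1 * 1 / (1 + 2^2)
These evaluate to 0.8 and 0.2, respectively, so the estimator should be pulled well below \(\beta_1 = 1\) in the high variance case.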
# Number of samples
s <- 1000
# Number of data points to generate per sample
n <- 200
# Create a data table with empty values to store the estimates of beta_hat_1
dt_estimates <- data.table(beta_hat_1_e_low = rep(NA, s),
                           beta_hat_1_e_high = rep(NA, s))
# Intercept term when y is the outcome
beta_0 <- 0.5
# Coefficient on x_1 when y is the outcome
beta_1 <- 1
# Coefficient on x_2 when y is the outcome
beta_2 <- 2
# We want to create i = 1, 2, ..., s samples
for (i in 1:s)
{
# Error term when y is the outcome
u <- rnorm(n, mean = 0, sd = 2)
# Measurement error with lower variance
e_low <- rnorm(n, mean = 0, sd = 0.5)
# Measurement error with higher variance
e_high <- rnorm(n, mean = 0, sd = 2)
# x_1
x_1 <- rnorm(n, mean = 0, sd = 1)
# x_1 mismeasured with lower variance e
x_1_star_low <- x_1 + e_low
# x_1 mismeasured with higher variance e
x_1_star_high <- x_1 + e_high
# x_2
x_2 <- rnorm(n, mean = 0, sd = 2)
# Outcome
y <- beta_0 + beta_1 * x_1 + beta_2 * x_2 + u
# Create a data table of y with lower variance measurement error
dt <- data.table(y = y, x_1_star = x_1_star_low, x_2 = x_2)
# Run regression of y on x_1_star and x_2
reg <- lm(y ~ x_1_star + x_2, data = dt)
# Place the current slope estimate of beta_1 in dt_estimates
dt_estimates$beta_hat_1_e_low[i] <- reg$coefficients["x_1_star"]
# Create a data table of y with higher variance measurement error
dt <- data.table(y = y, x_1_star = x_1_star_high, x_2 = x_2)
# Run regression of y on x_1_star and x_2
reg <- lm(y ~ x_1_star + x_2, data = dt)
# Place the current slope estimate of beta_1 in dt_estimates
dt_estimates$beta_hat_1_e_high[i] <- reg$coefficients["x_1_star"]
}
# Plot the density of the slope estimate on x_1 when its measurement error has lower variance
plot_beta_hat_1_e_low <- ggplot(dt_estimates, aes(x = beta_hat_1_e_low)) +
geom_density(color = color_2, fill = color_2, alpha = 0.7) +
geom_vline(xintercept = beta_1, color = color_1, linetype = "dashed", size = 1) +
labs(title = expression(paste("Low Variance ME in x_1"))) +
theme(
panel.background = element_rect(fill = "black", color = "black"),
plot.background = element_rect(fill = "black", color = "black"),
panel.grid.major = element_line(color = "gray"),
panel.grid.minor = element_line(color = "gray"),
axis.text = element_text(color = color_1, size = 8),
axis.title = element_text(color = color_2, size = 10),
plot.title = element_text(hjust = 0.5, color = color_2, size = 15, face = "bold")
) +
xlab(expression(hat(beta)[1])) +
xlim(range(dt_estimates$beta_hat_1_e_low, dt_estimates$beta_hat_1_e_high)) +
ylab("Density")
# Plot the density of the slope estimate on x_1 when its measurement error has higher variance
plot_beta_hat_1_e_high <- ggplot(dt_estimates, aes(x = beta_hat_1_e_high)) +
geom_density(color = color_2, fill = color_2, alpha = 0.7) +
geom_vline(xintercept = beta_1, color = color_1, linetype = "dashed", size = 1) +
labs(title = expression(paste("High Variance ME in x_1"))) +
theme(
panel.background = element_rect(fill = "black", color = "black"),
plot.background = element_rect(fill = "black", color = "black"),
panel.grid.major = element_line(color = "gray"),
panel.grid.minor = element_line(color = "gray"),
axis.text = element_text(color = color_1, size = 8),
axis.title = element_text(color = color_2, size = 10),
plot.title = element_text(hjust = 0.5, color = color_2, size = 15, face = "bold")
) +
xlab(expression(hat(beta)[1])) +
xlim(range(dt_estimates$beta_hat_1_e_low, dt_estimates$beta_hat_1_e_high)) +
ylab("Density")
# Arrange plots in a single figure
grid.arrange(plot_beta_hat_1_e_low, plot_beta_hat_1_e_high, nrow = 1)
Whether the measurement error has low or high variance, our estimator is biased. However, the bias increases as the variance of the measurement error increases, giving rise to what is commonly referred to as attenuation bias. This means our parameter estimate \(\widehat{\beta}_1 \to 0\) as the variance of the measurement error increases. In econometrics, there is a common saying that if you must have biased estimators, it is preferable to have attenuation bias since it results in underestimating the causal effect rather than overestimating it.
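As a final check, the simulated means of the slope estimates should line up with the attenuation factors computed before the simulation (values will vary slightly with the seed):
# Average slope estimates across samples; these should be close to the
# theoretical values of 0.8 (low variance ME) and 0.2 (high variance ME)
colMeans(dt_estimates)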