# Clear environment, plot pane, and console
rm(list = ls())
graphics.off()
cat("\014")
# Set working directory
setwd("C:/Users/wbras/OneDrive/Desktop/UA/Fall_2024/ECON_418-518/ECON_418-518_Lecture_Slides/ECON_418-518_Lecture_Slides_E/ECON_418-518_E_04_Inference/ECON_418-518_E_04_Inference_Code")
# Install pacman
if (!require(pacman)) install.packages("pacman")
# Load packages
pacman::p_load(data.table, ggplot2)
# Colors used for plot (https://www.rapidtables.com/web/color/RGB_Color.html)
color_1 <- "#00FF00"
color_2 <- "#ADD8E6"
# Disable scientific notation
options(scipen = 999)
The shape of the t-distribution is determined by its degrees of freedom. Let’s see what this shape looks like.
# Create data table for T-distributions
df_t <- c(2, 7, 100) # Degrees of freedom for t distribution
x_t <- seq(-5, 5, length.out = 100)
dt_t <- data.table(
x = rep(x_t, times = 3),
y = c(dt(x_t, df_t[1]), dt(x_t, df_t[2]), dt(x_t, df_t[3])),
group = factor(rep(1:3, each = 100))
)
# Create the t-distribution plot
ggplot(dt_t, aes(x, y, color = group)) +
geom_line(linewidth = 1) +
labs(
title = "t-Distributions",
color = "D.o.F." # Legend title
) +
scale_color_manual(
values = c("#FFA07A", "#20B2AA", "#778899"),
labels = c("2", "7", "100")
) +
theme(
panel.background = element_rect(fill = "black", color = "black"),
plot.background = element_rect(fill = "black", color = "black"),
panel.grid.major = element_line(color = "gray"),
panel.grid.minor = element_line(color = "gray"),
axis.text = element_text(color = color_1, size = 15),
axis.title = element_text(color = color_2, size = 25),
plot.title = element_text(hjust = 0.5, color = color_2, size = 30, face = "bold"),
plot.subtitle = element_text(hjust = 0.5, color = color_1, size = 25),
legend.background = element_rect(fill = "black"),
legend.text = element_text(color = color_1, size = 15),
legend.title = element_text(color = color_2, size = 20)
) +
expand_limits(y = 0) +
xlab("x") +
ylab("Density")
As you can see, this distribution is centered around zero for any degrees of freedom. Moreover, the tails get thinner as the degrees of freedom rise, indicating that the t-distribution converges to the standard normal as the degrees of freedom tend to infinity.
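To see this convergence numerically, here is a quick check comparing t critical values with the standard normal critical value as the degrees of freedom grow:
# 97.5th percentiles of t-distributions approach the standard normal value as the d.o.f. grow
qt(0.975, df = c(2, 7, 100, 10000))
qnorm(0.975)  # approximately 1.96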
The shape of the F-distribution is determined by both its numerator and denominator degrees of freedom. Let’s see what this shape looks like.
# Create data table for F-distributions
df_f1 <- c(2, 10) # Numerator degrees of freedom for F distribution
df_f2 <- c(10, 20) # Denominator degrees of freedom for F distribution
x_f <- seq(0, 5, length.out = 100)
dt_f <- data.table(
x = rep(x_f, times = 4),
y = c(df(x_f, df_f1[1], df_f2[1]), df(x_f, df_f1[2], df_f2[2]), df(x_f, df_f1[1], df_f2[2]), df(x_f, df_f1[2], df_f2[1])),
group = factor(rep(1:4, each = 100))
)
# Create the F distribution plot
ggplot(dt_f, aes(x, y, color = group)) +
geom_line(linewidth = 1) +
labs(
title = "F-Distributions",
color = "D.o.F." # Legend title
) +
scale_color_manual(
values = c("#FF6347", "#32CD32", "#8A2BE2", "#FFD700"),
labels = c("2, 10", "10, 20", "2, 20", "10, 10")
) +
theme(
panel.background = element_rect(fill = "black", color = "black"),
plot.background = element_rect(fill = "black", color = "black"),
panel.grid.major = element_line(color = "gray"),
panel.grid.minor = element_line(color = "gray"),
axis.text = element_text(color = color_1, size = 15),
axis.title = element_text(color = color_2, size = 25),
plot.title = element_text(hjust = 0.5, color = color_2, size = 30, face = "bold"),
plot.subtitle = element_text(hjust = 0.5, color = color_1, size = 25),
legend.background = element_rect(fill = "black"),
legend.text = element_text(color = color_1, size = 15),
legend.title = element_text(color = color_2, size = 20)
) +
expand_limits(y = 0) +
xlab("x") +
ylab("Density")
We can see that the F-distribution is one sided; it only has a tail on the right and no tail on the left. This is similar to the \(\chi^2\) distribution, from which the F-distribution is built (an F random variable is a ratio of two independent \(\chi^2\) random variables, each divided by its degrees of freedom). In fact, if \(F\) follows an F-distribution with \(q\) numerator and \(m\) denominator degrees of freedom, then \(qF\) converges in distribution to a \(\chi^2_q\) random variable as \(m\) tends to infinity.
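As a quick numerical check of this relationship, we can compare scaled F critical values with the corresponding \(\chi^2\) critical value (the specific degrees of freedom below are just illustrative):
# q times the 95th percentile of an F(4, m) distribution approaches the chi-squared(4) percentile as m grows
4 * qf(0.95, df1 = 4, df2 = c(10, 100, 10000))
qchisq(0.95, df = 4)  # approximately 9.49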
# Load in the econmath dataset from the wooldridge package as a data table
dt <- data.table(wooldridge::econmath)
# Show the first few rows of the data
head(dt)
# Show the last few rows of the data
tail(dt)
# Only keep data on colgpa, hsgpa, act, and study
dt <- dt[, .(colgpa, hsgpa, act, study)]
# Show how many NA values there are in the data
sum(is.na(dt))
## [1] 42
# Drop rows with any NA values
dt <- na.omit(dt)
# Show how many NA values there are in the data now
sum(is.na(dt))
## [1] 0
# Get summary statistics
summary(dt)
## colgpa hsgpa act study
## Min. :0.875 Min. :2.355 Min. :13.00 Min. : 0.00
## 1st Qu.:2.431 1st Qu.:3.114 1st Qu.:21.00 1st Qu.: 8.00
## Median :2.806 Median :3.333 Median :23.00 Median :12.00
## Mean :2.811 Mean :3.345 Mean :23.12 Mean :13.84
## 3rd Qu.:3.207 3rd Qu.:3.589 3rd Qu.:25.00 3rd Qu.:17.88
## Max. :4.000 Max. :4.260 Max. :33.00 Max. :50.00
Let’s conduct some T-tests. We’ll first construct the T-statistics based on the report produced by lm(). Then, we’ll do this for T-Test One from scratch. The T-statistic produced in R’s regression output is given by \(T = \frac{\widehat{\beta}}{\text{se}\left( \widehat{\beta} \right)}\), meaning it tests \(H_0: \beta = 0\) versus \(H_A: \beta \neq 0\).
Let’s focus on the case of a two tailed test of \(H_0: \beta_j = 0\) versus \(H_A: \beta_j \neq 0\). Why should we reject the null (be confident that \(\beta_j\) is different from zero) when \(|T| > t_{1 - \frac{\alpha}{2}, n - k}\)?
Let’s think about exactly what this T-statistic is. The T-statistic for our specific hypothesis test is \(T = \frac{\widehat{\beta}_j}{\text{se}\left( \widehat{\beta}_j \right)}\). Thus, a large \(|T|\) either means \(\widehat{\beta}_j\) is large in absolute value (far away from zero) or \(\text{se}\left( \widehat{\beta}_j \right)\) is small. When \(\widehat{\beta}_j\) is large in absolute value, we should be confident that \(\beta_j\) is different from zero because its estimate is. When \(\text{se}\left( \widehat{\beta}_j \right)\) is small, that indicates we are more certain \(\beta_j\) is different from zero (provided that \(\widehat{\beta}_j \neq 0\)) because a smaller standard error indicates more certainty surrounding the estimate.
Now, let’s think about exactly what this critical value is. The critical value for a two-tailed T-test is the value produced by the inverse CDF of the t-distribution with \(n-k\) degrees of freedom when evaluated at \(1 - \frac{\alpha}{2}\). Think about it this way: the CDF takes a number as input and returns the probability that a randomly selected value from the distribution is less than or equal to that input number, while the inverse CDF takes a probability as input and returns the corresponding value such that this value is greater than or equal to that proportion of the distribution. Mathematically, the critical value is defined as \[\begin{align} t_{1 - \frac{\alpha}{2}, n - k} &= F_{T_{n-k}}^{-1}\left(1 - \frac{\alpha}{2} \right) \end{align}\] where \(F_{T_{n-k}}^{-1}\) is the inverse CDF of the t-distribution with \(n-k\) degrees of freedom (it takes in a probability and outputs a number rather than the other way around).
When \(t_{1 - \frac{\alpha}{2}, n - k}\) is large, our hypothesis test is harder to reject. This value grows as \(\alpha\) decreases, which is why a test at \(\alpha = 0.01\) is harder to reject than a test at \(\alpha = 0.1\). A large \(t_{1 - \frac{\alpha}{2}, n - k}\) means this value is larger than most t-distributed random variables with \(n-k\) degrees of freedom. So, when \(|T| > t_{1 - \frac{\alpha}{2}, n - k}\), \(|T|\) must be pretty large, which implies either, as we discussed above, \(\widehat{\beta}_j\) is large in absolute value or \(\text{se}\left( \widehat{\beta}_j \right)\) is small. In either case, this gives evidence the null is false.
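To see this concretely, here is a quick check using R’s qt() function (covered in more detail below); the value 809 is the residual degrees of freedom of the regression we estimate shortly:
# Two-tailed critical values t_{1 - alpha/2, 809} for alpha = 0.1, 0.05, 0.01
alpha_levels <- c(0.10, 0.05, 0.01)
qt(1 - alpha_levels / 2, df = 809)  # roughly 1.65, 1.96, 2.58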
Let’s use the wooldridge dataset econmath to analyze determinants of college GPA, colgpa. The model’s covariates are hsgpa, act, study, and study_sq. The rationale for the squared term is that at some point studying so much may actually start to hurt your performance. So, we are estimating the model \[\begin{align} colgpa_i = \beta_0 + \beta_1 hsgpa_i + \beta_2 act_i + \beta_3 study_i + \beta_4 study_i^2 + u_i. \end{align}\]
# Get and show the number of observations
n <- nrow(dt)
cat("The number of observations is: ", n)
## The number of observations is: 814
# Number of estimated parameters (including the intercept)
k <- 5
# Create variable for the square of total hours per week spent studying
dt[, study_sq := study^2]
# Run regression of colgpa on covariates
reg <- lm(colgpa ~ hsgpa + act + study + study_sq, data = dt)
# Print the summary of reg
summary(reg)
##
## Call:
## lm(formula = colgpa ~ hsgpa + act + study + study_sq, data = dt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.80612 -0.30055 0.02035 0.33471 1.18006
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.023759108 0.174500199 -0.136 0.892
## hsgpa 0.648049922 0.053154305 12.192 < 0.0000000000000002 ***
## act 0.026520295 0.005488096 4.832 0.00000161 ***
## study 0.003921578 0.006589506 0.595 0.552
## study_sq -0.000002247 0.000168500 -0.013 0.989
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4653 on 809 degrees of freedom
## Multiple R-squared: 0.2591, Adjusted R-squared: 0.2555
## F-statistic: 70.73 on 4 and 809 DF, p-value: < 0.00000000000000022
Let’s conduct the hypothesis test of \(H_0: \beta_2 = 0\) versus \(H_A: \beta_2 \neq 0\) at the \(\alpha = 0.05\) level. The T-statistic for this test is the one reported by R’s summary() function in the column t value. Since we are interested in \(\beta_2\), the T-statistic for our test is \(4.832\). We need to find the t-critical value \(t_{1 - \frac{\alpha}{2}, n - k} = t_{0.975, 809}\). Now, we use R’s qt() function, the inverse CDF of the t-distribution, to get the t-critical value for 809 degrees of freedom with exactly 0.975 area to the left of it.
# Get the t-critical value
t_crit <- qt(0.975, 809)
# Show the t-critical value
t_crit
## [1] 1.962901
Since \(|T| = 4.832 > 1.96 = t_{0.975, 809}\), we reject the null hypothesis and conclude that \(\beta_2\) is different from zero at the \(\alpha = 0.05\) significance level (95% confidence level).
Let’s conduct the hypothesis test of \(H_0: \beta_1 = 1\) versus \(H_A: \beta_1 \neq 1\) at the \(\alpha = 0.1\) level. The T-statistic for this test is not reported by R’s summary() function, so we must construct it ourselves: \[\begin{align} T &= \frac{\widehat{\beta}_1 - 1}{\text{se} \left[ \widehat{\beta}_1 \right]} \\ &= \frac{0.648 - 1}{0.053} \\ &= -6.642 \end{align}\]
Now, we need to find the t-critical value \(t_{1 - \frac{\alpha}{2}, n - k} = t_{0.95, 809}\). We use R’s qt() function to get the inverse CDF of the t-distribution. This inverse CDF will give the t-critical value for 809 degrees of freedom with exactly 0.95 area to the left of it.
# Get the t-critical value
t_crit <- qt(0.95, 809)
# Show the t-critical value
t_crit
## [1] 1.646739
Since \(|T| = 6.642 > 1.64 = t_{0.95, 809}\), we reject the null hypothesis and conclude that \(\beta_1\) is different from one at the \(\alpha = 0.1\) significance level (90% confidence level).
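As a sanity check on the hand calculation, we can pull the unrounded estimate and standard error straight out of the coefficient table stored by summary(); this is a minimal sketch, and the object names coefs and t_stat_hsgpa are mine:
# Extract the coefficient table from the fitted regression
coefs <- summary(reg)$coefficients
# T-statistic for H_0: beta_1 = 1 using the unrounded hsgpa estimate and standard error
t_stat_hsgpa <- (coefs["hsgpa", "Estimate"] - 1) / coefs["hsgpa", "Std. Error"]
# About -6.62 (the hand calculation above uses rounded values); we still reject since |T| > 1.64
t_stat_hsgpa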
Let’s conduct the hypothesis test of \(H_0: \beta_1 \leq 0.5\) versus \(H_A: \beta_1 > 0.5\) at the \(\alpha = 0.05\) level. This is a right-tailed test and the T-statistic for this test is not reported by R’s summary() function, so we must construct it ourselves: \[\begin{align} T &= \frac{\widehat{\beta}_1 - 0.5}{\text{se} \left[ \widehat{\beta}_1 \right]} \\ &= \frac{0.648 - 0.5}{0.053} \\ &= 2.792 \end{align}\]
Now, we need to find the t-critical value \(t_{1 - \alpha, n - k} = t_{0.95, 809}\). We use R’s qt() function to get the inverse CDF of the t-distribution. This inverse CDF will give the t-critical value for 809 degrees of freedom with exactly 0.95 area to the left of it.
# Get the t-critical value
t_crit <- qt(0.95, 809)
# Show the t-critical value
t_crit
## [1] 1.646739
Since \(T = 2.792 > 1.64 = t_{0.95, 809}\), we reject the null hypothesis and conclude that \(\beta_1\) is greater than one-half at the \(\alpha = 0.05\) significance level (95% confidence level).
Let’s conduct the hypothesis test of \(H_0: \beta_2 \geq 0.03\) versus \(H_A: \beta_2 < 0.03\) at the \(\alpha = 0.01\) level. This is a left-tailed test and the T-statistic for this test is not reported by R’s summary() function, so we must construct it ourselves: \[\begin{align} T &= \frac{\widehat{\beta}_2 - 0.03}{\text{se} \left[ \widehat{\beta}_2 \right]} \\ &= \frac{0.027 - 0.03}{0.0054} \\ &= -0.555 \end{align}\]
Now, we need to find the t-critical value \(t_{\alpha, n - k} = t_{0.01, 809}\). We use R’s qt() function to get the inverse CDF of the t-distribution. This inverse CDF will give the t-critical value for 809 degrees of freedom with exactly 0.01 area to the left of it.
# Get the t-critical value
t_crit <- qt(0.01, 809)
# Show the t-critical value
t_crit
## [1] -2.330966
Since \(T = -0.555 > -2.33 = t_{0.01, 809}\), we fail to reject the null hypothesis and cannot conclude \(\beta_2 < 0.03\) at the \(\alpha = 0.01\) significance level (99% confidence level).
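Since T-Tests Two through Four all follow the same recipe (compute the T-statistic, find the appropriate critical value, compare the two), here is a small helper that packages those steps. The function coef_t_test() and its argument names are my own illustration, not anything built into R:
# Hypothetical helper: t-test about a single coefficient of a fitted lm model
# coef_name: coefficient to test; null_value: hypothesized value under H_0
# alternative: "two.sided", "greater", or "less"; alpha: significance level
coef_t_test <- function(model, coef_name, null_value = 0,
                        alternative = "two.sided", alpha = 0.05) {
  coefs <- summary(model)$coefficients
  df_resid <- model$df.residual
  t_stat <- (coefs[coef_name, "Estimate"] - null_value) / coefs[coef_name, "Std. Error"]
  if (alternative == "two.sided") {
    t_crit <- qt(1 - alpha / 2, df_resid)
    reject <- abs(t_stat) > t_crit
  } else if (alternative == "greater") {
    t_crit <- qt(1 - alpha, df_resid)
    reject <- t_stat > t_crit
  } else {
    t_crit <- qt(alpha, df_resid)
    reject <- t_stat < t_crit
  }
  list(t_stat = t_stat, t_crit = t_crit, reject = reject)
}
# Reproduce the left-tailed test above: H_0: beta_2 >= 0.03 versus H_A: beta_2 < 0.03 at alpha = 0.01
coef_t_test(reg, "act", null_value = 0.03, alternative = "less", alpha = 0.01)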
Let’s create the test statistics produced by R’s summary() function by hand. The summary() table can be derived from left to right. That is, we first obtain the parameter estimates by computing \((X^\prime X)^{-1} X^\prime \boldsymbol{y}\). Then, we obtain the standard errors, use those standard errors to get the T-statistics, and then use those T-statistics to get the p-values.
# Make our y vector
y <- dt[, colgpa]
# Make a vector of 1's to add as the first column of our X matrix for the intercept
Intercept <- rep(1, nrow(dt))
# Now lets construct our X matrix
X <- as.matrix(cbind(Intercept, dt[, .(hsgpa, act, study, study_sq)]))
# Show the first 5 rows and all columns of X
X[1:5, ]
## Intercept hsgpa act study study_sq
## [1,] 1 3.355 27 10.0 100.00
## [2,] 1 3.219 24 22.5 506.25
## [3,] 1 3.306 21 12.0 144.00
## [4,] 1 3.977 31 40.0 1600.00
## [5,] 1 3.890 32 15.0 225.00
# Transpose of X
Xprime <- t(X)
# Compute the OLS solution for beta_hat
beta_hat <- solve(Xprime %*% X) %*% Xprime %*% y
# Show beta_hat
beta_hat
## [,1]
## Intercept -0.023759107645
## hsgpa 0.648049922299
## act 0.026520294947
## study 0.003921578066
## study_sq -0.000002246773
Notice this is identical to that produced by R’s summary() function, as we would expect.
# Number of observations
n <- nrow(X)
# Number of estimated parameters (including intercept)
k <- ncol(X)
# Obtain the estimated residuals
u_hat <- y - X %*% beta_hat
# Obtain the estimated error variance (dividing by n - k applies the degrees of freedom correction)
sigma_hat_sq <- sum(u_hat^2) / (n - k)
# Get inverse of X_prime %*% X
Xprime_X_inverse <- solve(Xprime %*% X)
# Obtain the estimated variance covariance matrix
var_cov <- sigma_hat_sq * Xprime_X_inverse
# Get the diagonal elements of the estimated variance-covariance matrix (the elements along the diagonal are the variances of the parameter estimates)
beta_hat_var <- diag(var_cov)
# Take the square root of the estimated variance of the estimated parameters to get the standard error
beta_hat_se <- sqrt(beta_hat_var)
# Show the standard errors
beta_hat_se
## Intercept hsgpa act study study_sq
## 0.1745001987 0.0531543048 0.0054880963 0.0065895064 0.0001685002
Notice this is identical to that produced by R’s summary() function, as we would expect.
We use the created standard errors to generate the test statistics. Please note, these are the test statistics associated with the test of \(H_0: \beta_j = 0\) versus \(H_A: \beta_j \neq 0\).
# Divide beta_hat by its standard error to get its T-statistic
beta_hat_t <- beta_hat / beta_hat_se
# Show the T-statistics
beta_hat_t
## [,1]
## Intercept -0.13615519
## hsgpa 12.19186150
## act 4.83233046
## study 0.59512471
## study_sq -0.01333395
Notice this is identical to that produced by R’s summary() function, as we would expect.
The p-value associated with our OLS estimators is given by \[\begin{align} p &= 2 * \mathbb{P} \left( T_{n - k} > |T| \right) \\ &= 2 * \left(1 - F_{T_{n-k}} \left( |T| \right) \right) \end{align}\] where \(F_{T_{n-k}}\) is the CDF of the t-distribution with \(n-k\) degrees of freedom, \(T_{n-k}\) is any t-distributed random variable with \(n-k\) degrees of freedom, and \(T\) is our test statistic, which has a t-distribution with \(n-k\) degrees of freedom under the null hypothesis.
What exactly is the intuition of the p-value? Well, the ones given in the output of the summary() function are associated with hypothesis tests of \(H_0: \beta_j = 0\) versus \(H_A: \beta_j \neq 0\). So, if our p-value is small, that means \(2 * \mathbb{P} \left( T_{n - k} > |T| \right)\) is small, which is to say \(\mathbb{P} \left( T_{n - k} > |T| \right)\) is small. Thus, \(|T|\) is likely pretty large.
\(T = \frac{\widehat{\beta}_j}{\text{se}\left( \widehat{\beta}_j \right)}\), so a large \(|T|\) either means \(\widehat{\beta}_j\) is large in absolute value (far away from zero) or \(\text{se}\left( \widehat{\beta}_j \right)\) is small. When \(\widehat{\beta}_j\) is large in absolute value, we should be confident that \(\beta_j\) is different from zero. When \(\text{se}\left( \widehat{\beta}_j \right)\) is small, that indicates we are more certain \(\beta_j\) is different from zero (provided that \(\widehat{\beta}_j \neq 0\)). Either case produces a larger \(|T|\), implying a smaller p-value \(p\).
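As a quick illustration of this relationship (a sketch using the 809 residual degrees of freedom from our regression), the two-sided p-value shrinks rapidly as \(|T|\) grows:
# Two-sided p-values for a few hypothetical T-statistics with 809 degrees of freedom
2 * pt(abs(c(0.5, 2, 5)), df = 809, lower.tail = FALSE)  # falls from about 0.62 toward zero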
We can construct one-sided p-values to conduct either left or right-sided tests, but I’ve never seen this done in practice so we won’t worry about it here.
P-values are derived from the T-statistics. Let’s recall our regression output:
# Print the summary of reg
summary(reg)
##
## Call:
## lm(formula = colgpa ~ hsgpa + act + study + study_sq, data = dt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.80612 -0.30055 0.02035 0.33471 1.18006
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.023759108 0.174500199 -0.136 0.892
## hsgpa 0.648049922 0.053154305 12.192 < 0.0000000000000002 ***
## act 0.026520295 0.005488096 4.832 0.00000161 ***
## study 0.003921578 0.006589506 0.595 0.552
## study_sq -0.000002247 0.000168500 -0.013 0.989
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4653 on 809 degrees of freedom
## Multiple R-squared: 0.2591, Adjusted R-squared: 0.2555
## F-statistic: 70.73 on 4 and 809 DF, p-value: < 0.00000000000000022
Notice the *** on the right side associated with the estimates of hsgpa and act, as well as the Signif. codes legend. A triple *** means we reject the null hypothesis of \(\beta_j = 0\) at the \(\alpha = 0.001\) significance level. A double ** means we reject the null hypothesis of \(\beta_j = 0\) at the \(\alpha = 0.01\) significance level. A single * means we reject the null hypothesis of \(\beta_j = 0\) at the \(\alpha = 0.05\) significance level. A single . means we reject the null hypothesis of \(\beta_j = 0\) at the \(\alpha = 0.1\) significance level.
We can construct these p-values using R’s pt() function, which corresponds to the CDF of the t-distribution. Let’s get the p-value associated with the act estimate.
# Use pt (CDF of T-distribution) to get the p-value
beta_hat_p <- 2 * pt(abs(4.832), n - k, lower.tail = FALSE)
# Show the p-values
beta_hat_p
## [1] 0.000001617101
We could also have run the following code and gotten the same result.
# Use pt (CDF of T-distribution) to get the p-value
beta_hat_p <- 2 * (1 - pt(abs(4.832), n - k, lower.tail = TRUE))
# Show the p-values
beta_hat_p
## [1] 0.000001617101
This is because 2 * pt(abs(4.832), n - k, lower.tail = FALSE) corresponds to \(2 * \mathbb{P} \left( T_{n - k} > |T| \right)\), so we assign lower.tail = FALSE because we want the probability that a \(T_{n-k}\) random variable is greater than the absolute value of the test statistic (think greater than, so we want the upper tail, i.e., lower.tail = FALSE). Moreover, 2 * (1 - pt(abs(4.832), n - k, lower.tail = TRUE)) corresponds to \(2 * \left(1 - F_{T_{n-k}} \left(|T| \right) \right)\), so we assign lower.tail = TRUE because we want the probability that a \(T_{n-k}\) random variable is less than or equal to the absolute value of the test statistic (think less than, so we want the lower tail, i.e., lower.tail = TRUE).
We use the created test statistics to generate the p-values. Please note, these are the p-values associated with the test of \(H_0: \beta_j = 0\) versus \(H_A: \beta_j \neq 0\).
# Use pt (CDF of t-distribution) to get the p-value
beta_hat_p <- 2 * pt(abs(beta_hat_t), n - k, lower.tail = FALSE)
# Show the p-values
beta_hat_p
## [,1]
## Intercept 0.891732471362761458522072643972933292
## hsgpa 0.000000000000000000000000000000165226
## act 0.000001614493989861939950408026289708
## study 0.551926539161966323021601965592708439
## study_sq 0.989364651553847451737055962439626455
Notice this is identical to that produced by R’s summary() function, as we would expect.
Confidence intervals are extremely important in econometrics and applied statistics. I can guarantee you they will show up in your applied work and I can almost guarantee you they will show up on your exam 😊.
A \(100(1 - \alpha)\%\) confidence interval for \(\beta_j\) is given by \[\begin{align} \left[ \widehat{\beta}_j - t_{1 - \frac{\alpha}{2}, n - k} \cdot \text{se} \left(\widehat{\beta}_j \right), \, \widehat{\beta}_j + t_{1 - \frac{\alpha}{2}, n - k} \cdot \text{se}\left(\widehat{\beta}_j \right) \right]. \end{align}\] If you are wondering why this confidence interval is the way it is, I proved this result in the ECON_418-518_E_04_Inference lecture slides.
The intuition of a confidence interval is pretty straightforward: if we drew many samples and constructed a \(100(1 - \alpha)\%\) confidence interval for \(\beta_j\) from each of them, roughly \(100(1 - \alpha)\%\) of those intervals would contain the true \(\beta_j\).
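One way to see this intuition in action is a small simulation; this is entirely illustrative, and the data-generating process, seed, and object names below are my own choices rather than part of the lecture:
# Illustrative coverage simulation: the true slope is 2, so roughly 95% of the
# 95% confidence intervals constructed below should contain 2
set.seed(418)
n_sims <- 1000
covered <- logical(n_sims)
for (s in 1:n_sims) {
  x_sim <- rnorm(100)
  y_sim <- 1 + 2 * x_sim + rnorm(100)
  ci_sim <- confint(lm(y_sim ~ x_sim), "x_sim", level = 0.95)
  covered[s] <- ci_sim[1] <= 2 & 2 <= ci_sim[2]
}
# Share of intervals containing the true slope (should be close to 0.95)
mean(covered)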
We can construct one-sided confidence intervals to conduct either left or right-sided tests, but I’ve never seen this done in practice so we won’t worry about it here.
One poor aspect of R’s summary() function is that it doesn’t directly give us confidence intervals for our parameters. However, we can easily obtain a confidence interval for each parameter using R’s confint() function.
# Show the 90% confidence interval for each parameter
confint(reg, level = 0.90)
## 5 % 95 %
## (Intercept) -0.3111154463 0.2635972310
## hsgpa 0.5605186385 0.7355812061
## act 0.0174828310 0.0355577589
## study -0.0069296213 0.0147727774
## study_sq -0.0002797227 0.0002752291
# Show the 95% confidence interval for each parameter
confint(reg, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) -0.3662856615 0.3187674462
## hsgpa 0.5437133028 0.7523865418
## act 0.0157477071 0.0372928828
## study -0.0090129684 0.0168561245
## study_sq -0.0003329959 0.0003285023
# Show the 99% confidence interval for each parameter
confint(reg, level = 0.99)
## 0.5 % 99.5 %
## (Intercept) -0.4743046851 0.4267864698
## hsgpa 0.5108097522 0.7852900924
## act 0.0123504686 0.0406901213
## study -0.0130920013 0.0209351574
## study_sq -0.0004373008 0.0004328072
Notice these confidence intervals get wider as the confidence level rises (i.e., as \(\alpha\) decreases). This is because as \(\alpha\) decreases, \(t_{1 - \frac{\alpha}{2}, n - k}\) gets larger (look under the T-Tests Intuition section for why this is), which implies the number \(t_{1 - \frac{\alpha}{2}, n - k} \cdot \text{se}\left(\widehat{\beta}_j \right)\) being subtracted from and added to \(\widehat{\beta}_j\) is larger. The intuition is that if we want to make a claim about a parameter value with a higher confidence level, then we widen the range of possible values it can lie in to buffer our claim.
Let’s manually construct a 95% confidence interval for act and carry out the hypothesis test of \(H_0: \beta_2 = 0\) versus \(H_A: \beta_2 \neq 0\).
# Let's get the T-critical value for a 95% confidence level test
t_crit <- qt(0.975, df = n - k)
# Create a vector that holds the left and right endpoints of the confidence interval
ci <- c(beta_hat[3] - t_crit * beta_hat_se[3], beta_hat[3] + t_crit * beta_hat_se[3])
# Show the vector
ci
## act act
## 0.01574771 0.03729288
Notice this is identical to that produced by R’s confint() function. Moreover, zero is not contained in this interval, so we reject the null hypothesis of \(\beta_2 = 0\) and conclude that \(\beta_2 \neq 0\) with 95% confidence.
F-Tests are used to conduct tests regarding joint significance. Suppose we wish to test \(H_0: \beta_3 = \beta_4 = 0\) versus \(H_A: \beta_3 \neq 0 \ \text{or} \ \beta_4 \neq 0\) at the \(\alpha\) significance level. If we fail to reject the null, we can conclude both parameters are statistically insignificant at the \(\alpha\) significance level with a single F-test rather than two individual T-tests. However, when we reject this null, we still need to carry out individual T-tests to determine if \(\beta_3 \neq 0\), \(\beta_4 \neq 0\), or both are non-zero.
Although our null and alternative hypotheses have equalities instead of inequalities, there is no need to divide \(\alpha\) by two when obtaining the critical value like we usually do for two-tailed tests. This is because the F-distribution only has one tail, on the right side. See the graph of various F-distributions for various degrees of freedom in the F-Distribution section above.
Let’s focus on the case of the test \(H_0: \beta_3 = \beta_4 = 0\) versus \(H_A: \beta_3 \neq 0 \ \text{or} \ \beta_4 \neq 0\). Why should we reject the null (be confident that one of these parameters is different from zero) when \(F > f_{1 - \alpha, q, n - k}\)?
Let’s think about exactly what this F-statistic is. The F-statistic is defined as \[\begin{align} F = \frac{(SSR_r - SSR_{ur}) / q}{SSR_{ur} / (n - k)} \end{align}\] which follows an F-distribution with \(q\) numerator degrees of freedom and \(n-k\) denominator degrees of freedom under the null hypothesis. \(SSR_r\) is the sum of the squared residuals from the restricted model, \(SSR_{ur}\) is the sum of the squared residuals from the unrestricted model, \(q\) is the number of restrictions (think about this as the number of equal signs in the null hypothesis), and \(n-k\) is the degrees of freedom, where \(k\) is the number of estimated parameters in the unrestricted model (intercept included). When this F-statistic is large (meaning we are more likely to reject the null), either \((SSR_r - SSR_{ur}) / q\) is large or \(SSR_{ur} / (n - k)\) is small. Let’s think about both cases:
When \((SSR_r - SSR_{ur}) / q\) is large, either \((SSR_r - SSR_{ur})\) is large or \(q\) is small. When \((SSR_r - SSR_{ur})\) is large, we know our restricted model has a larger sum of squared residuals relative to our unrestricted model, indicating that excluding \(\beta_3\) and \(\beta_4\) hurts our model’s performance, so it is likely at least one of these parameters is non-zero (the null is false). \(q\) is simply a degrees of freedom correction ensuring our F-statistic follows an F-distribution.
When \(SSR_{ur} / (n - k)\) is small, either \(SSR_{ur}\) is small or \(n - k\) is large. When \(SSR_{ur}\) is small, the unrestricted model has a small sum of squared residuals. This means it does a good job predicting our outcome, indicating that \(\beta_3\) and \(\beta_4\) should remain in our model (the null is false). \(n-k\) is simply a degrees of freedom correction ensuring our F-statistic follows an F-distribution.
Now, let’s think about exactly what this critical value is. The critical value for an F-test is the value produced by the inverse CDF of the F-distribution with \(q\) numerator degrees of freedom and \(n-k\) denominator degrees of freedom when evaluated at \(1 - \alpha\). Think about it this way: the CDF takes a number as input and returns the probability that a randomly selected value from the distribution is less than or equal to that input number, while the inverse CDF takes a probability as input and returns the corresponding value such that this value is greater than or equal to that proportion of the distribution. Mathematically, the critical value is defined as \[\begin{align} f_{1 - \alpha, q, n - k} &= F_{f_{q, n-k}}^{-1}\left(1 - \alpha \right) \end{align}\] where \(F_{f_{q, n-k}}^{-1}\) is the inverse CDF of the F-distribution with \(q\) numerator degrees of freedom and \(n-k\) denominator degrees of freedom (it takes in a probability and outputs a number rather than the other way around).
When \(f_{1 - \alpha, q, n - k}\) is large our hypothesis test is harder to reject. This value is larger as \(\alpha\) decreases which is why a test at \(\alpha = 0.01\) is harder to reject than a test at \(\alpha = 0.1\). A large \(f_{1 - \alpha, q, n - k}\) means it is larger than most F-distributed random variables. So, when \(F > f_{1 - \alpha, q, n - k}\) this means \(F\) must be pretty large which implies either, as we discussed above, \((SSR_r - SSR_{ur}) / q\) is large or \(SSR_{ur} / (n - k)\) is small. In either case, this gives evidence the null is false.
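Just as with the t critical values, a quick check with R’s qf() function (the inverse CDF of the F-distribution) shows the critical value growing as \(\alpha\) shrinks; here for \(q = 2\) restrictions and 809 denominator degrees of freedom:
# F-critical values f_{1 - alpha, 2, 809} for alpha = 0.1, 0.05, 0.01
alpha_levels <- c(0.10, 0.05, 0.01)
qf(1 - alpha_levels, df1 = 2, df2 = 809)  # roughly 2.31, 3.01, 4.63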
Recall our regression summary:
# Show regression summary
summary(reg)
##
## Call:
## lm(formula = colgpa ~ hsgpa + act + study + study_sq, data = dt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.80612 -0.30055 0.02035 0.33471 1.18006
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.023759108 0.174500199 -0.136 0.892
## hsgpa 0.648049922 0.053154305 12.192 < 0.0000000000000002 ***
## act 0.026520295 0.005488096 4.832 0.00000161 ***
## study 0.003921578 0.006589506 0.595 0.552
## study_sq -0.000002247 0.000168500 -0.013 0.989
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4653 on 809 degrees of freedom
## Multiple R-squared: 0.2591, Adjusted R-squared: 0.2555
## F-statistic: 70.73 on 4 and 809 DF, p-value: < 0.00000000000000022
Let’s conduct the test of \(H_0: \beta_1 = \beta_2 = \beta_3 = \beta_4 = 0\) versus \(H_A: \text{One of these coefficients is non-zero}\) at the \(\alpha = 0.01\) significance level. This corresponds to a restricted model of \[\begin{align} colgpa_i = \beta_0 + v_i \end{align}\] and an unrestricted model of \[\begin{align} colgpa_i = \beta_0 + \beta_1 hsgpa + \beta_2 act + \beta_3 study + \beta_4 study^2 + u_i. \end{align}\]
In the case of the restricted model, \(\displaystyle \widehat{\beta}_0 = \frac{1}{n} \sum_{i=1}^n colgpa_i = \overline{colgpa}\). So, we are essentially testing whether high school GPA, ACT scores, and hours studied per week jointly do a better job predicting college GPA than simply using its sample mean.
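We can verify this equivalence directly with the data table already in memory:
# The intercept of the intercept-only (restricted) model equals the sample mean of colgpa
coef(lm(colgpa ~ 1, data = dt))
mean(dt[, colgpa])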
In this specific case, the F-statistic for this test, \(F = 70.73\), is reported by R’s summary() function near the bottom, with the associated p-value of \(p < 2.2 \times 10^{-16}\) to the right of it, as well as the numerator degrees of freedom \(q = 4\) and the denominator degrees of freedom \(n - k = 814 - 5 = 809\). Since the p-value is much lower than \(\alpha = 0.01\), we reject the null hypothesis and conclude at least one of the coefficients is non-zero. However, to conclude which of them is non-zero we must carry out individual T-tests.
Let’s verify these results by computing the F-statistic and its p-value manually. Recall that the F-distribution is only right tailed so there is no need to multiply the p-value by two.
# Significance level
alpha <- 0.01
# Number of restrictions
q <- 4
# Number of estimated parameters of the unrestricted model
k_ur <- 5
# Number of estimated parameters of the restricted model
k_r <- k_ur - q
# Unrestricted regression
reg_ur <- lm(colgpa ~ hsgpa + act + study + study_sq, data = dt)
# Restricted regression
reg_r <- lm(colgpa ~ 1, data = dt)
# SSR for unrestricted model
SSR_ur <- sum(reg_ur$residuals^2)
# SSR for restricted model
SSR_r <- sum(reg_r$residuals^2)
# Numerator for F-statistic
F_stat_num <- (SSR_r - SSR_ur) / q
# Denominator for F-statistic
F_stat_denom <- SSR_ur / (n - k_ur)
# F-statistic
F_stat <- F_stat_num / F_stat_denom
# Show the F-statistic
cat("The F-statistic is: ", F_stat, "\n")
## The F-statistic is: 70.7337
# F-critical value
f_crit <- qf(1 - alpha, df1 = q, df2 = n - k_ur)
# Show the F-critical value
cat("The F-critical value is: ", f_crit, "\n")
## The F-critical value is: 3.342404
# Use pf (CDF of F-distribution) to get the p-value
F_stat_p <- pf(F_stat, df1 = q, df2 = n - k_ur, lower.tail = FALSE)
# Show the p-value on the F-statistic
cat("The p-value associated with the F-statistic is: ", F_stat_p)
## The p-value associated with the F-statistic is: 0.000000000000000000000000000000000000000000000000002183434
Notice this is identical to that produced by R’s summary() function.
Since \(F = 70.73 > 3.34 = f_{1 - \alpha, q, n-k}\), we reject the null like the p-value indicates.
We could have used the formula \[\begin{align} F = \frac{R^2 / (k - 1)}{(1 - R^2) / (n - k)} \end{align}\] and gotten identical results because we are testing the null hypothesis of an intercept only model. Note, this formula only works when testing this specific null hypothesis. Let’s verify this.
# Number of estimated parameters including intercept
k <- 5
# Number of observations
n <- nrow(dt)
# Obtain outcome from our data table
y <- dt[, colgpa]
# Obtain the residuals
u_hat <- y - X %*% beta_hat
# Obtain the total sum of squares (SST)
SST <- sum((y - mean(y))^2)
# Get the residual sum of squares (SSR)
SSR <- sum((u_hat)^2)
# Use the SST and SSR to get the R-squared
r_squared <- 1 - (SSR / SST)
# Compute the numerator of the F-statistic
F_stat_num <- r_squared / (k - 1)
# Compute the denominator of the F-statistic
F_stat_denom <- (1 - r_squared) / (n - k)
# Compute the F-statistic
F_stat <- F_stat_num / F_stat_denom
# Show the F-statistic
cat("The F-statistic is: ", F_stat)
## The F-statistic is: 70.7337
Notice this is identical to the F-statistic reported by R’s summary() function.
Moreover, let’s use R’s anova() function to replicate these results. This is very useful and does most of the work for us.
# Unrestricted regression
reg_ur <- lm(colgpa ~ hsgpa + act + study + study_sq, data = dt)
# Restricted regression
reg_r <- lm(colgpa ~ 1, data = dt)
# F-Test Summary
anova(reg_r, reg_ur)
Notice that in the second row of the table we get the F-statistic and the p-value associated with it.
Let’s conduct the test of \(H_0: \beta_3 = \beta_4 = 0\) versus \(H_A: \text{One of these coefficients is non-zero}\) at the \(\alpha = 0.1\) significance level. This corresponds to a restricted model of \[\begin{align} colgpa_i = \beta_0 + \beta_1 hsgpa + \beta_2 act + v_i \end{align}\] and an unrestricted model of \[\begin{align} colgpa_i = \beta_0 + \beta_1 hsgpa + \beta_2 act + \beta_3 study + \beta_4 study^2 + u_i. \end{align}\]
# Significance level
alpha <- 0.1
# Number of restrictions
q <- 2
# Number of estimated parameters of the unrestricted model
k_ur <- 5
# Number of estimated parameters of the restricted model
k_r <- k_ur - q
# Unrestricted regression
reg_ur <- lm(colgpa ~ hsgpa + act + study + study_sq, data = dt)
# Restricted regression
reg_r <- lm(colgpa ~ hsgpa + act, data = dt)
# SSR for unrestricted model
SSR_ur <- sum(reg_ur$residuals^2)
# SSR for restricted model
SSR_r <- sum(reg_r$residuals^2)
# Numerator for F-statistic
F_stat_num <- (SSR_r - SSR_ur) / q
# Denominator for F-statistic
F_stat_denom <- SSR_ur / (n - k_ur)
# F-statistic
F_stat <- F_stat_num / F_stat_denom
# Show the F_statistic
cat("The F-statistic is: ", F_stat, "\n")
## The F-statistic is: 1.623042
# F-critical value
f_crit <- qf(1 - alpha, df1 = q, df2 = n - k_ur)
# Show the F-critical value
cat("The F-critical value is: ", f_crit, "\n")
## The F-critical value is: 2.309151
# Use pf (CDF of F-distribution) to get the p-value
F_stat_p <- pf(F_stat, df1 = q, df2 = n - k_ur, lower.tail = FALSE)
# Show the p-value on the F-statistic
cat("The p-value associated with the F-statistic is: ", F_stat_p)
## The p-value associated with the F-statistic is: 0.1979394
Since \(F = 1.62 < 2.31 = f_{1 - \alpha, q, n-k}\), we fail to reject the null, so we cannot conclude that \(\beta_3\) or \(\beta_4\) is different from zero at this significance level. The p-value of \(p = 0.198 > 0.10 = \alpha\) confirms this result.
Moreover, let’s use R’s anova() function to replicate these results. This is very useful and does most of the work for us.
# Unrestricted regression
reg_ur <- lm(colgpa ~ hsgpa + act + study + study_sq, data = dt)
# Restricted regression
reg_r <- lm(colgpa ~ hsgpa + act, data = dt)
# F-Test Summary
anova(reg_r, reg_ur)
Notice in the second row of the table we get the F-statistic and the p-value associated with it.
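If you have the car package installed (an assumption; it is not loaded above), its linearHypothesis() function runs the same joint test directly on the unrestricted model:
# Joint F-test of H_0: beta_3 = beta_4 = 0 via car (assumes the car package is installed)
car::linearHypothesis(reg_ur, c("study = 0", "study_sq = 0"))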