pkgs <- c("pipeR", "dplyr", "tidyr", "ggplot2", "readr", "readxl")
lapply(pkgs, require, character.only = TRUE)
平均値差の統計分布は,
$$ \bar{X_1} - \bar{X_2} \sim N\left( \mu_1 - \mu_2, \sigma^2 \left( \frac{1}{n_1} + \frac{1}{n_2} \right) \right)$$となる.
標準化すると,
$$\frac{\bar{X_1} - \bar{X_2} - (\mu_1 - \mu_2)}{\sigma \sqrt{\frac{1}{n_1} + \frac{1}{n_2}}} \sim N(0, 1)$$となる.
母標準偏差 $\sigma$ は次の $\hat{\sigma}^2_\mathrm{pooled}$ で推定する.
$$ \hat{\sigma}^2_\mathrm{pooled} = \frac{(n_1 - 1) \hat{\sigma}_1^2 + (n_2 - 1) \hat{\sigma}_2^2}{n_1 + n_2 - 2}$$検定統計量 $t$ は,以下のようになる.
$$ t = \frac{\bar{X_1} - \bar{X_2}}{\sqrt{\frac{(n_1 - 1)\hat{\sigma}_1^2 + (n_2 - 1)\hat{\sigma}_2^2}{n_1 + n_2 - 2}\left(\frac{1}{n_1} + \frac{1}{n_2}\right)}}$$$t$ は帰無仮説 $\mu_1 = \mu_2$ のもとで,自由度n1 + n2 - 2 の $t$ 分布に従う.
d <- read_csv("teaching_methods.csv")
names(d) <- c("id", "name", "sex","math", "stat", "psy_test", "stat_test1", "stat_test2", "teaching_method")
str(d)
stat_test1.male <- d %>>% filter(sex == "男") %>>% (stat_test1)
stat_test1.female <- d %>>% filter(sex == "女") %>>% (stat_test1)
stat_test1.male
stat_test1.female
検定統計量 $t$
n1 <- length(stat_test1.male)
n2 <- length(stat_test1.female)
t.a <- mean(stat_test1.male) - mean(stat_test1.female)
t.b <- (n1 - 1)*var(stat_test1.male) + (n2 - 1) * var(stat_test1.female)
t.c <- n1 + n2 - 2
sigma.pooled <- sqrt(t.b / t.c)
t.d <- (1 / n1 + 1 / n2)
t <- t.a / sigma.pooled / sqrt(t.d)
t
棄却域
lower <- qt(0.05 / 2, n1 + n2 - 2)
upper <- qt(0.05 / 2, n1 + n2 - 2, lower.tail = FALSE)
cat("t < ", lower, ", t > ", upper)
options(repr.plot.width = 4, repr.plot.height = 4)
ggplot(data.frame(x = c(-3, 3)), aes(x)) +
stat_function(fun = dt, args = list(df = n1 + n2 - 2)) +
geom_vline(xintercept = c(qt(0.025, df = n1 + n2 - 2), qt(0.975, df = n1 + n2 - 2)))
帰無仮説を棄却できない. 男女で統計テスト1の得点に有意差があるとは言えない.
$p$ 値は,
2 * pt(t, df = n1 + n2 - 2)
t.test()
を使う
t.test(stat_test1.male, stat_test1.female, var.equal = TRUE)
classA <- c(54, 55, 52, 48, 50, 38, 41, 40, 53, 52)
classB <- c(67, 63, 50, 60, 61, 69, 43, 58, 36, 29)
var.test(classA, classB)
帰無仮説(母分散が等質である)が棄却されたので,Welchの$t$検定を用いる.
t.test(classA, classB, var.equal = FALSE)
帰無仮説を棄却できないので,平均に差があるとは言えない
グループ1の得点をX1,グループ2の得点をX2とすると,差Dは,
$$D = X_2 - X_1$$となる.標本平均を考えると,
$$ \bar{D} = \bar{X}_2 - \bar{X}_1 $$となる.
stat_test1 <- d$stat_test1
stat_test2 <- d$stat_test2
diff.stat12 <- stat_test2 - stat_test1
diff.stat12
mean(stat_test1)
mean(stat_test2)
mean(diff.stat12)
統計検定量は,
$$ D \sim N \left( \mu_D, \sigma^2_D \right) $$とすれば
$$ \bar{D} \sim N \left( \mu_D, \frac{\sigma^2_D}{n} \right) $$これを標準化すると
$$ Z = \frac{\bar{D} - \mu_D}{\sigma_D / \sqrt{n}} $$は $N(0, 1)$ にしたがう.
よって,
$$ t = \frac{\bar{D} - \mu_D}{\hat{\sigma}_D / \sqrt{n}} $$は $df$ = n - 1 の$t$ 分布にしたがう.
n <- length(diff.stat12)
a <- mean(diff.stat12)
b <- sd(diff.stat12) / sqrt(n)
t <- a / b
t
棄却域は,
lower <- qt(0.05 / 2, df = n - 1)
upper <- qt(0.05 / 2, df = n - 1, lower.tail = FALSE)
cat("t < ", lower, ", t > ", upper)
なので,棄却域に入る
options(repr.plot.width = 4, repr.plot.height = 4)
ggplot(data.frame(x = c(-3, 3)), aes(x)) +
stat_function(fun = dt, args = list(df = n - 1)) +
scale_x_continuous(breaks = c(-3:3, 1)) +
geom_vline(xintercept = c(qt(0.025, df = n - 1), qt(0.975, df = n - 1)))
t.test(diff.stat12)
t.test(stat_test1, stat_test2, paired = TRUE)
var.test(stat_test1, stat_test2)
分散は同じだとは言えない
t.test(stat_test1, stat_test2, var.equal = FALSE)
stat_test1.like <- d %>>% filter(stat == "好き") %>>% (stat_test1)
stat_test1.dislike <- d %>>% filter(stat == "嫌い") %>>% (stat_test1)
stat_test1.like
stat_test1.dislike
var.test(stat_test1.like, stat_test1.dislike)
t.test(stat_test1.like, stat_test1.dislike, var.equal = TRUE)
2群の平均値には差がある
psy_test.male <- d %>>% filter(sex == "男") %>>% (psy_test)
psy_test.female <- d %>>% filter(sex == "女") %>>% (psy_test)
psy_test.male
psy_test.female
var.test(psy_test.male, psy_test.female)
t.test(psy_test.male, psy_test.female, var.equal = TRUE)
男女で有意な差があるとは言えない
before <- stringr::str_split("61 50 41 55 51 48 46 55 65 70", pattern = " ")[[1]] %>>% as.numeric
after <- stringr::str_split("59 48 33 54 47 52 38 50 64 63", pattern = " ")[[1]] %>>% as.numeric
before
after
t.test(before, after, paired = TRUE)
前後の平均値が異なる
mean(before)
mean(after)
効果があった
devtools::session_info()