# Print the current date and time as a record of when the notebook was run.
date()
# Attach every package the notebook uses. `require` returns TRUE/FALSE per
# package instead of stopping, so the printed logical vector shows which ones
# loaded.
# NOTE(review): a missing package only yields FALSE here and fails later at
# its first use; `library()` would fail fast instead.
sapply(c("pipeR", "dplyr", "tidyr", "ggplot2", "Cairo", "readr", "readxl", "openxlsx"), require, character.only = TRUE)
# Plot size options; presumably read by the Jupyter R kernel (repr) to size
# inline figures -- confirm.
options(repr.plot.width = 4, repr.plot.height = 4)
# Small sample with one extreme value (100).
X <- c(0, 7, 8, 9, 100)
# Arithmetic mean; pulled upwards by the extreme value.
mean(X)
# Vector containing a missing value (NA) and 0/0, which evaluates to NaN.
Y <- c(0, 7, 8, NA, 0/0)
Y
# Without na.rm the missing values propagate, so no numeric mean is returned.
mean(Y)
na.rm = TRUE は NaN も消す
# na.rm = TRUE drops NaN as well as NA before averaging.
mean(Y, na.rm = TRUE)
20%除去
# Trimmed mean: trim = 0.2 discards 20% of the observations from each end
# (one value per end for these five values).
mean(X, trim = 0.2)
# X is already sorted, so the trimmed mean equals the mean of the middle three.
mean(X[2:4])
# Median of X, for comparison with the two means above.
median(X)
確率変数$X$,$Y$が独立なら,和の分散は分散の和なので,
$$ \sigma^2_{X \pm Y} = \sigma^2_X + \sigma^2_Y $$
標本分散は $n-1$ で割る
$$\frac{1}{n-1}\sum\limits^n_{i = 1}(X_i - \bar{X})^2$$
x <- 1:5
# Sample variance (divisor n - 1) of the current x.
var(x)
どれか一つを外して分散を出す
# combn(1:5, 4) has one column per 4-element subset, i.e. one column for each
# value left out; apply(..., 2, var) takes the variance of every column.
apply(combn(c(1:5), 4), 2, var)
平均は元の分散と一致
# Average the five leave-one-out variances.
apply(combn(c(1:5), 4), 2, var) %>>% mean()
$n$ で割る分散
# Population variance of `x`: the mean squared deviation from the mean
# (divisor n, unlike `var()`, which divides by n - 1).
#
# x: numeric vector.
# Returns a single number. A single observation gives 0 (the version derived
# from `var()` gave NA there, because `var()` is undefined for n = 1). NA in
# `x` yields NA and an empty vector yields NaN.
varp <- function(x) {
  mean((x - mean(x))^2)
}
# Population variance (divisor n) next to the sample variance (divisor n - 1)
# of the same x.
varp(x)
var(x)
# Sample standard deviation of 1..10.
x <- 1:10
sd(x)
# Quartile-based summaries of nine equally spaced values: interquartile range,
# quantiles (0%, 25%, 50%, 75%, 100%) and Tukey's five-number summary.
x <- 1:9
IQR(x)
quantile(x)
fivenum(x)
# Same summaries for unevenly spaced values. quantile() interpolates between
# order statistics while fivenum() uses hinges, so the quartiles differ here
# (e.g. lower quartile 2.5 vs lower hinge 2).
y <- c(1, 2, 4, 8, 16, 32)
IQR(y)
quantile(y)
fivenum(y)
# Uniform random numbers on (0, 1): a single draw, then ten draws.
runif(1)
runif(10)
0.5引いて範囲を $-0.5 \leq x \leq 0.5$ に
分散は,
$$\int^{0.5}_{-0.5} x^2dx = \frac{1}{12}$$
なので,12個足すと分散は1になる
# Density histogram of one centred uniform variate, U(0, 1) - 0.5.
X <- runif(1000000) - 0.5
ggplot(data.frame(x = X), aes(x = x, y = ..density..)) +
  geom_histogram(bins = 50, fill = "gray", colour = "white") +
  theme_bw() +
  scale_x_continuous(limits = c(-2.1, 2.1))

# Sum of two independent centred uniforms, drawn on the same x range.
X <- (runif(1000000) - 0.5) + (runif(1000000) - 0.5)
ggplot(data.frame(x = X), aes(x = x, y = ..density..)) +
  geom_histogram(bins = 50, fill = "gray", colour = "white") +
  theme_bw() +
  scale_x_continuous(limits = c(-2.1, 2.1))

# Sum of three independent centred uniforms, drawn on the same x range.
X <- (runif(1000000) - 0.5) + (runif(1000000) - 0.5) + (runif(1000000) - 0.5)
ggplot(data.frame(x = X), aes(x = x, y = ..density..)) +
  geom_histogram(bins = 50, fill = "gray", colour = "white") +
  theme_bw() +
  scale_x_continuous(limits = c(-2.1, 2.1))
# Sum of 12 independent centred uniforms, U(0, 1) - 0.5.
#
# The draws are laid out as an n x 12 matrix: column j holds the j-th batch of
# n draws, so each row sum adds one draw from each of the 12 batches. This
# consumes the random stream in the same order as drawing 12 separate vectors
# and summing them element-wise, but rowSums() does the summation in compiled
# code instead of calling sum() once per column at R level.

# Small example: ten such sums, printed.
rowSums(matrix(runif(12 * 10) - 0.5, ncol = 12))

# One million such sums, kept in X for the histogram that follows.
X <- rowSums(matrix(runif(12 * 1000000) - 0.5, ncol = 12))
# Density histogram of the sums in X with the standard normal density
# (dnorm with its default mean and sd) drawn on top.
ggplot(data.frame(x = X), aes(x)) +
  geom_histogram(
    aes(y = ..density..),
    bins = 50, fill = "gray", colour = "white"
  ) +
  theme_bw() +
  scale_x_continuous(limits = c(-5, 5)) +
  stat_function(fun = dnorm)
$-\infty$ から $\infty$ まで積分すると1になる
正規分布の確率密度関数: dnorm(x, mean = 0, sd = 1)
# Standard normal density at x = 1.
dnorm(1)
# The same value from the closed-form density. In R, -1^2 parses as -(1^2),
# which is exactly the exponent -x^2 wanted here.
(1 / sqrt(2 * pi)) * exp(-1^2 / 2)
# Numerical integral of exp(-x^2 / 2) over the whole real line ...
integrate(function(x){exp(-x^2/2)}, -Inf, Inf)
# ... compared with sqrt(2 * pi), the normalising constant of the density.
sqrt(2 * pi)
中心極限定理
平均 $\mu$, 分散 $\sigma^2$の確率変数$X$から取り出した数の平均値$\bar{X}$
$$\bar{X} = \frac{X_1 + X_2 + \ldots + X_n}{n}$$
の分布は,平均$\mu$,分散$\sigma^2/n$なので
$$ \frac{\bar{X} - \mu}{\sqrt{\sigma^2/n}} $$
の分布は平均 0 分散 1になる. nが十分大きくなると標準正規分布に近づく.
確率分布に関するRの関数: dxx, pxx, qxx, rxx
正規分布の場合は,
- dnorm(x)
- pnorm(q) = $\int^q_{-\infty}\mathrm{dnorm}(x)dx$
- qnorm(p)
- rnorm(n)

正規分布$\mathcal{N}(\mu, \sigma^2)$ に従う$X$が $\mu - \sigma \lt X \lt \mu + \sigma$ に入る確率は,
$\mathcal{N}(0, 1)$ に従う$Z$が $-1 \lt Z \lt 1$ に入る確率に等しい
# P(-1 < Z < 1) for a standard normal Z, as the difference of the cumulative
# distribution function at the two ends.
pnorm(1) - pnorm(-1)
# The same probability written as one minus the two tails; by symmetry each
# tail has probability pnorm(-1).
1 - 2 * pnorm(-1)
2$\sigma$
# Probability of falling within two standard deviations of the mean.
1 - 2 * pnorm(-2)
3$\sigma$
# Probability of falling within three standard deviations of the mean.
1 - 2 * pnorm(-3)
95%
# qnorm is the inverse of pnorm: the z with P(Z <= z) = 0.95. This is a
# lower-tail quantile, not the half-width of a central 95% interval.
qnorm(0.95)
99%
# Likewise, the z with P(Z <= z) = 0.99.
qnorm(0.99)
# Read the score table; the path is relative to the working directory.
x <- read_csv("data/rika_hist.csv")
# Show the structure (column names and types) of what was read.
str(x)
# Rename the columns. Assumes the file has exactly two columns -- presumably
# number of correct answers and number of students with that score; confirm.
names(x) <- c("correct", "student")
平均
# Expand the frequency table to one value per student (each `correct` value
# repeated `student` times) and take the mean.
rep(x$correct, x$student) %>>% mean()
標準偏差
# Sample standard deviation of the same expanded scores.
rep(x$correct, x$student) %>>% sd()
# Open a Cairo raster device and select the IPAexGothic font, presumably so
# the Japanese axis labels below render correctly -- confirm.
Cairo(type = "raster")
CairoFonts(regular = "IPAexGothic")
# Bar chart of the share of students (student / sum(student)) for each number
# of correct answers, with a normal density curve and a vertical line overlaid.
x %>>% ggplot(aes(x = correct, y = student / sum(student))) +
geom_bar(stat="identity", colour = "white", fill = "gray") +
theme_bw() +
scale_x_continuous(breaks = seq(0, 25, 5)) +
scale_y_continuous(limits = c(0, 0.07), breaks = seq(0, 0.07, 0.01)) +
# NOTE(review): mean = 13.37 and sd = 5.74 are hard-coded here and in the
# vertical line; presumably the rounded mean and sd of the expanded scores
# computed earlier. Confirm, or compute them from x so they cannot drift.
stat_function(fun = dnorm, args = list(mean = 13.37, sd = 5.74)) +
geom_vline(xintercept = 13.37, size = 0.3) +
xlab("正答数") +
ylab("生徒数の割合")
# Close the Cairo device opened above.
dev.off()
# Standard normal and standard Cauchy densities on one plot. The two-row data
# frame only supplies the x range (-3 to 3) over which stat_function evaluates
# the curves.
ggplot(data.frame(x = c(-3, 3)), aes(x)) +
# Normal density with its text label.
stat_function(fun = dnorm) +
annotate(label = "Normal", geom = "text", x = 1.2, y = 0.35) +
# Cauchy density, drawn with a thicker line (size = 1), with its text label.
stat_function(fun = dcauchy, size = 1) +
annotate(label = "Cauchy", geom = "text", x = 0.3, y = 0.16) +
scale_x_continuous(breaks = seq(-3, 3, 1)) +
scale_y_continuous(limits = c(0, 0.4)) +
theme_bw()
# One million draws from the standard Cauchy distribution.
x <- rcauchy(1000000)
# Base-graphics histogram of the raw draws.
hist(x)
# The six smallest draws, to show how extreme the tail values are.
sort(x) %>>% head()
# Sample mean and standard deviation. The Cauchy distribution has no finite
# mean or variance, so these values are not stable from run to run.
mean(x)
sd(x)
# Histogram of one million standard normal draws. No bins/binwidth is given,
# so ggplot2 falls back to its default binning.
ggplot(data.frame(x = rnorm(1000000)), aes(x)) +
geom_histogram()
# Squared standard normal draws, with the chi-square density for df = 1
# overlaid.
# NOTE(review): limits set on scale_x_continuous / scale_y_continuous drop
# out-of-range values rather than zooming, so draws above 8 and any bar taller
# than 1 are removed; coord_cartesian() would zoom instead -- confirm intent.
ggplot(data.frame(x = rnorm(1000000)^2), aes(x)) +
geom_histogram(aes(y = ..density..), colour = "black", fill = gray(0.8)) +
stat_function(fun = dchisq, args = list(df = 1)) +
theme_bw() +
scale_x_continuous(limits = c(0, 8)) +
scale_y_continuous(limits = c(0, 1))
# Sum of two independent squared standard normal draws, with the chi-square
# density for df = 2 overlaid; same axis limits as the previous plot.
ggplot(data.frame(x = rnorm(1000000)^2 + rnorm(1000000)^2), aes(x)) +
geom_histogram(aes(y = ..density..), colour = "black", fill = gray(0.8)) +
stat_function(fun = dchisq, args = list(df = 2)) +
theme_bw() +
scale_x_continuous(limits = c(0, 8)) +
scale_y_continuous(limits = c(0, 1))
$X$ が $\mathcal{N}(0, 1)$,$Y$が自由度$v$の$\chi^2$分布に従い,互いに独立なとき,次の$t$は自由度$v$の$t$分布に従う:
# $$ t = \frac{X}{\sqrt{Y / v}} $$
#
# One-sample t statistic of `x` against the hypothesised mean `mu`:
# (mean(x) - mu) / (sd(x) / sqrt(n)), with n = length(x).
#
# x:  numeric vector, one sample.
# mu: hypothesised mean; defaults to 0.5, the mean of U(0, 1).
# Returns a single number.
#
# The sample size is taken from `x` instead of being fixed at 12, so the
# statistic stays correct for samples of any size; for samples of size 12 and
# the default `mu` the result is unchanged.
fun.t <- function(x, mu = 0.5) {
  (mean(x) - mu) / (sd(x) / sqrt(length(x)))
}
# Simulate one million samples of size 12 from U(0, 1) and compute the
# t statistic of each. The draws form a 1,000,000 x 12 matrix whose row i
# holds the i-th draw of each of the 12 batches, so every row is one sample.
t <- apply(matrix(runif(12 * 1000000), ncol = 12), 1, fun.t)

# Density histogram of the simulated statistics, with the standard normal
# density (dashed) and the t density with 11 degrees of freedom (solid).
ggplot(data.frame(x = t), aes(x)) +
  geom_histogram(
    aes(y = ..density..),
    colour = "black", fill = "gray", size = 0.1
  ) +
  theme_bw() +
  scale_x_continuous(breaks = seq(-4, 4, 2), limits = c(-5, 5)) +
  stat_function(fun = dnorm, size = 0.3, linetype = 2) +
  stat_function(fun = dt, args = list(df = 11), size = 0.3)
$u_1$, $u_2$ がそれぞれ自由度$v_1$, $v_2$ の$\chi^2$分布に独立に従うとき,次の$F$は自由度$(v_1, v_2)$の$F$分布に従う
$$ F = \frac{u_1 / v_1}{u_2 / v_2} $$
ggplot(data.frame(x = c(0, 5)), aes(x)) +
stat_function(fun = df, args = list(df1 = 1, df2 = 10)) +
stat_function(fun = df, args = list(df1 = 5, df2 = 10), linetype = 2)
devtools::session_info()