date()
sapply(c("pipeR", "ggplot2", "dplyr", "tidyr", "readr"), require, character.only = TRUE)
options(repr.plot.width = 4, repr.plot.height = 4)
例題
例題データ読み込み(http://hosho.ees.hokudai.ac.jp/%7Ekubo/stat/iwanamibook/fig/distribution/data.RData)
dir(".", full.names = TRUE) %>>% cat(sep = "\n")
dir("./data", full.names = TRUE, recursive = TRUE) %>>% cat(sep = "\n")
ls.str()
load("data/chap02/data.RData")
ls.str()
データはすべて非負の整数
data
個数は50(50個体分のデータ)
length(data)
標本平均は3.56
summary(data)
度数分布
table(data)
ヒストグラムを描く
data_frame(data = data) %>>%
ggplot(aes(x = data)) +
geom_histogram(binwidth = 1, colour = "black", fill = gray(0.6)) +
scale_y_continuous(breaks = seq(0, 12, 2)) +
scale_x_continuous(breaks = seq(0, 8, 2), limits = c(-0.5, 8.1)) +
labs(title = "Histogram of data") +
ylab("Frequency")
標本分散
var(data)
標本標準偏差
sd(data)
var(data) %>>% sqrt
平均 3.56 のポアソン分布は,
y <- 0:9
prob <- dpois(y, lambda = 3.56)
cbind(y, prob)
data_frame(y, prob) %>>%
ggplot(aes(y, prob)) +
geom_line(linetype = 2) +
geom_point() +
scale_x_continuous(breaks = seq(0, 8, 2))
観測データのヒストグラムと確率分布を重ねる(確率分布 $\times$ 個体数)
ggplot() +
geom_histogram(data = data_frame(x = data), aes(x = x),
colour = "black", fill = gray(0.6), binwidth = 1) +
geom_line(data = data_frame(y, prob = prob * 50), aes(x = y, y = prob), linetype = 2) +
geom_point(data = data_frame(y, prob = prob * 50), aes(x = y, y = prob),
shape = 21, fill = "white") +
labs(title = "Histogram of data") +
ylab("Frequency") +
xlab("data") +
scale_x_continuous(breaks = seq(0, 8, 2)) +
scale_y_continuous(breaks = seq(0, 12, 2))
定義は,
$$p(y | \lambda) = \frac{\lambda^y \exp(-\lambda)}{y!} $$性質は,
種子数のデータを表現するのにポアソン分布を選んだのは,
パラメータ $\lambda$ を変化させると,
data_frame(x = c(0:20), y1 = dpois(x, lambda = 3.5), y2 = dpois(x, lambda = 7.7),
y3 = dpois(x, lambda = 15.1)) %>>%
gather(lambda, val, -x) %>>%
ggplot(aes(x = x, y = val, shape = lambda)) +
geom_line() +
geom_point(fill = "white") +
ylab("prob") +
xlab("y") +
scale_shape_manual(values = c(21, 23, 24), labels = c("3.5", "7.7", "15.1")) +
theme(legend.position=c(.9, .85))
$y_i$,$y_2$,$y_3$についてなら,尤度は,
data[1:3]
dpois(data[1:3], lambda = 3.56)
dpois(data[1:3], lambda = 3.56) %>>% prod()
平均 $\lambda$ を変化させたときにポアソン分布の形状と対数尤度がどのように変化するか
対数尤度にして計算
logL <- function(m){
dpois(data, m, log = TRUE) %>>% sum
}
gpPois <- function(lambda){
gp <- ggplot() +
geom_histogram(data = data_frame(x = data), aes(x = x),
colour = "black", fill = gray(0.6), binwidth = 1) +
geom_line(data = data_frame(y, prob = dpois(y, lambda = lambda) * 50), aes(x = y, y = prob),
linetype = 2) +
geom_point(data = data_frame(y, prob = dpois(y, lambda = lambda) * 50), aes(x = y, y = prob),
shape = 21, fill = "white") +
scale_x_continuous(breaks = seq(0, 8, 2)) +
scale_y_continuous(breaks = seq(0, 15, 5), limits = c(0, 15)) +
annotate(x = 6, y = 13, hjust = 0, vjust = 0, geom = "text", size = 3,
label = sprintf("lambda=%.1f\nlog L=%.1f", lambda, logL(lambda))) +
theme(axis.title = element_blank())
return(gp)
}
gps <- seq(2.0, 5.2, 0.4) %>>% lapply(gpPois)
library(gridExtra)
options(repr.plot.width = 8, repr.plot.height = 8)
$\lambda = 3.6$ のとき $\log L = -97.3$ で最大
do.call(grid.arrange, c(gps, ncol = 3))
対数尤度と$\lambda$の関係
options(repr.plot.width = 4, repr.plot.height = 4)
lambda = seq(2, 5, 0.1)
data_frame(lambda, logL = sapply(lambda, logL)) %>>%
ggplot(aes(x = lambda, y = logL)) +
geom_line() +
geom_vline(xintercept = 3.56, linetype = 2) +
scale_x_continuous(breaks = seq(2, 5, 0.5))
対数尤度が最大となる$\lambda$ を $\hat\lambda$ とする. 対数尤度関数が最大値で傾きゼロとなるとなる$\lambda$を探せばよい
$$ \begin{align} \frac{\partial \log L(\lambda)}{\partial \lambda} &= \sum\limits_{i}\left\{ \frac{y_i}{\lambda} - 1 \right\} \\ &= \frac{1}{\lambda}\sum\limits_{i}y_i - 50 \end{align} $$これが 0になるときの $\lambda$ が $\hat\lambda$ なので,解くと, $$ \hat\lambda = \frac{1}{50}\sum\limits_{i}y_i $$ これは標本平均のことなので $\hat\lambda = 3.56$ となる
一般化すると,パラメータ$\theta$の確率分布から観測データ$y_i$が発生した時の確率を$p(y_i | \theta)$とすると尤度と対数尤度は,
$$ L(\theta | \boldsymbol{Y}) = \prod\limits_i p(y_i | \theta) $$$$ \log L(\theta | \boldsymbol{Y}) = \sum\limits_i \log p(y_i | \theta) $$となり,最尤推定で尤度(対数尤度)最大の$\hat{\theta}$を探す
binom.test(4, 10)
この区間の中で,尤度が最大となる$\theta$を求める(最尤推定).
対数尤度は次のようになる.
$$\log L(\theta) = \log({}_{10}\mathrm{C}_4) + 4\log\theta + 6\log(1 - \theta)$$これをプロットすると(定数項は除く)
logL <- function(t){
4 * log(t) + 6 * log(1 - t)
}
ggplot(data_frame(x = c(0, 1)), aes(x)) +
stat_function(fun = logL) +
xlab(expression(theta)) +
ylab(expression("L("~theta~")")) +
geom_vline(xintercept = c(0.12, 0.74), linetype = 2)
$L(\theta)$ が最大になる$\theta$を求める.
$$ \frac{d}{d\theta} \log L(\theta) = \frac{4}{\theta} - \frac{6}{1-\theta} $$が0になる点なので,$\theta = 0.4$
(10枚中4枚表が出たので,表が出る確率は0.4)
data_frame(y = sapply(c(1:3000), function(x){
rpois(50, lambda = 3.5) %>>% mean
})) %>>% ggplot(aes(x = y)) +
geom_histogram(colour = "black", fill = gray(0.6), binwidth = 0.1) +
scale_x_continuous(limits = c(2.5, 4.5), breaks = seq(2.5, 4.5, 0.5)) +
xlab(expression("Estimated "~lambda~"for each trial"))
調査個体数が多ければ標準誤差は小さくなる
data_frame(y = sapply(c(1:3000), function(x){
rpois(500, lambda = 3.5) %>>% mean
})) %>>% ggplot(aes(x = y)) +
geom_histogram(colour = "black", fill = gray(0.6), binwidth = 0.1) +
scale_x_continuous(limits = c(2.5, 4.5), breaks = seq(2.5, 4.5, 0.5)) +
xlab(expression("Estimated "~lambda~"for each trial"))
ggplot(data_frame(x = c(0:40)), aes(x)) +
stat_function(geom="point", n=41, fun = dbinom, args = list(size = 10, prob = 0.4), colour = "blue") +
stat_function(geom="point", n=41, fun = dbinom, args = list(size = 20, prob = 0.7), colour = "green") +
stat_function(geom="point", n=41, fun = dbinom, args = list(size = 40, prob = 0.5), colour = "red")
ggplot(data_frame(x = c(0, 20)), aes(x)) +
stat_function(fun = dgamma, args = list(shape = 1, rate = 0.5), colour = "red") +
stat_function(fun = dgamma, args = list(shape = 2, rate = 0.5), colour = "orange") +
stat_function(fun = dgamma, args = list(shape = 1, rate = 1), colour = "blue") +
stat_function(fun = dgamma, args = list(shape = 2, rate = 1), colour = "green")
devtools::session_info()