date()

sapply(c("pipeR", "dplyr", "tidyr", "ggplot2", "Cairo", "readr", "readxl", "openxlsx"), require, character.only = TRUE)

Loading required package: pipeR
Warning message:
“package ‘pipeR’ was built under R version 3.2.4”Loading required package: dplyr

Attaching package: ‘dplyr’

 以下のオブジェクトは ‘package:stats’ からマスクされています: 

     filter, lag 

 以下のオブジェクトは ‘package:base’ からマスクされています: 

     intersect, setdiff, setequal, union 

Loading required package: tidyr
Loading required package: ggplot2
Loading required package: Cairo
Loading required package: readr
Loading required package: readxl
Warning message:
“package ‘readxl’ was built under R version 3.2.4”Loading required package: openxlsx

options(repr.plot.width = 4, repr.plot.height = 4)

Chapter 2 統計の基礎¶

2.1 尺度水準¶

名義尺度
順序尺度
間隔尺度
比例尺度・比率尺度

代表値¶

平均値 $\bar{X}$ （算術平均）

X <- c(0, 7, 8, 9, 100)
mean(X)

NA: 欠損値
NaN: Not a Number

Y <- c(0, 7, 8, NA, 0/0)
Y

mean(Y)

[1] NA

na.rm = TRUE は NaN も消す

mean(Y, na.rm = TRUE)

幾何平均
トリム平均

20%除去

mean(X, trim = 0.2)

mean(X[2:4])

中央値

median(X)

ウィンザライズド平均
ミッドレンジ

2.3 確率変数，乱数，母集団，標本¶

分散，母分散

$$\sigma^2 = V(X) = E((X - \mu)^2), \mu = E(X)$$

標準偏差

$$\sigma = \sqrt{E((X - \mu)^2}$$

確率変数$X$，$Y$が独立なら，和の分散は分散の和なので，

$$ \sigma^2_{X \pm Y} = \sigma^2_X + \sigma^2_Y $$

標本分散は $n-1$ で割る

$$\frac{1}{n-1}\sum\limits^n_{ti = 1}(X_i - \bar{X})^2$$

x <- 1:5
var(x)

どれか一つを外して分散を出す

apply(combn(c(1:5), 4), 2, var)

平均は元の分散と一致

apply(combn(c(1:5), 4), 2, var) %>>% mean()

$n$ で割る分散

varp <- function(x){var(x) * (length(x) - 1) / length(x)}

varp(x)

var(x)

標準偏差

x <- 1:10
sd(x)

四分位範囲（IQR），五数要約

x <- 1:9
IQR(x)

quantile(x)

fivenum(x)

y <- c(1, 2, 4, 8, 16, 32)
IQR(y)

quantile(y)

fivenum(y)

2.5 中心極限定理と正規分布¶

runif(1)

runif(10)

0.5引いて範囲を $-0.5 \leq x \leq 0.5$ に

分散は，

$$\int^{0.5}_{0.5} x^2dx = \frac{1}{12}$$

なので，12個足すと1になる

X <- runif(1000000) - 0.5
data.frame(x = X) %>>% 
    ggplot(aes(x = x, y = ..density..)) + 
        geom_histogram(bins = 50, fill = "gray", colour = "white") + 
        theme_bw() + 
        scale_x_continuous(limits = c(-2.1, 2.1))

X <- (runif(1000000) - 0.5) + (runif(1000000) - 0.5)
data.frame(x = X) %>>% 
    ggplot(aes(x = x, y = ..density..)) + 
        geom_histogram(bins = 50, fill = "gray", colour = "white") + 
        theme_bw() + 
        scale_x_continuous(limits = c(-2.1, 2.1))

X <- (runif(1000000) - 0.5) + (runif(1000000) - 0.5) + (runif(1000000) - 0.5)
data.frame(x = X) %>>% 
    ggplot(aes(x = x, y = ..density..)) + 
        geom_histogram(bins = 50, fill = "gray", colour = "white") + 
        theme_bw() + 
        scale_x_continuous(limits = c(-2.1, 2.1))

c(1:12) %>>% 
    lapply(function(x){
        runif(10) - 0.5}
    ) %>>% 
    {do.call(rbind, (.))} %>>%
    apply(2, sum)

X <- c(1:12) %>>% 
        lapply(function(x){
            runif(1000000) - 0.5}
        ) %>>% 
        {do.call(rbind, (.))} %>>%
        apply(2, sum)

data.frame(x = X) %>>% 
    ggplot(aes(x)) + 
        geom_histogram(aes(y = ..density..), bins = 50, fill = "gray", colour = "white") + 
        theme_bw() + 
        scale_x_continuous(limits = c(-5, 5)) + 
        stat_function(fun = dnorm)

標準正規分布

$$ f(x) = \frac{1}{\sqrt{2\pi}}\exp\left(-\frac{x^2}{2}\right) $$

$-\infty$ から $\infty$ まで積分すると1になる

正規分布の確率密度関数: dnorm(x, mean = 0, sd = 1)

dnorm(1)

(1 / sqrt(2 * pi)) * exp(-1^2 / 2)

$$\int^{\infty}_{-\infty} -\frac{x^2}{2}dx$$

integrate(function(x){exp(-x^2/2)}, -Inf, Inf)

2.506628 with absolute error < 0.00023

sqrt(2 * pi)

中心極限定理

平均 $\mu$，分散 $\sigma^2$の確率変数$X$から取り出した数の平均値$\bar{X}$

$$\bar{X} = \frac{X_1 + X_2 + \ldots + X_n}{n}$$

の分布は，平均$\mu$，分散$\sigma^2/n$なので

$$ \frac{\bar{X} - \mu}{\sqrt{\sigma^$/n}} $$

の分布は平均 0 分散 1になる． nが十分大きくなると標準正規分布に近づく．

確率分布に関するRの関数: dxx, pxx, qxx, rxx

正規分布の場合は，

密度関数: dnorm(x)
分布関数: pnorm(q) = $\int^q_{-\infty}\mathrm{dnorm}(x)dx$
分位関数: qnorm(p)
乱数発生: rnorm(n)

正規分布$\mathcal{N}(\mu, \sigma^2)$ に従う$X$が $\mu - \sigma \lt X \lt \mu + \sigma$ に入る確率

$\mathcal{N}(0, 1)$ に従う$Z$が $-1 \lt Z \lt 1$ に入る確率に等しい

pnorm(1) - pnorm(-1)

1 - 2 * pnorm(-1)

2$\sigma$

1 - 2 * pnorm(-2)

3$\sigma$

1 - 2 * pnorm(-3)

95%

qnorm(0.95)

99%

qnorm(0.99)

テストの分布

x <- read_csv("data/rika_hist.csv")
str(x)

Classes ‘tbl_df’, ‘tbl’ and 'data.frame':	26 obs. of  2 variables:
 $ 正答数: int  0 1 2 3 4 5 6 7 8 9 ...
 $ 生徒数: int  2247 4449 10774 19983 29385 37881 44207 48518 51221 53379 ...

names(x) <- c("correct", "student")

平均

rep(x$correct, x$student) %>>% mean()

標準偏差

rep(x$correct, x$student) %>>% sd()

Cairo(type = "raster")
CairoFonts(regular = "IPAexGothic")
x %>>% ggplot(aes(x = correct, y = student / sum(student))) + 
    geom_bar(stat="identity", colour = "white", fill = "gray") + 
    theme_bw() + 
    scale_x_continuous(breaks = seq(0, 25, 5)) +
    scale_y_continuous(limits = c(0, 0.07), breaks = seq(0, 0.07, 0.01)) + 
    stat_function(fun = dnorm, args = list(mean = 13.37, sd = 5.74)) + 
    geom_vline(xintercept = 13.37, size = 0.3) + 
    xlab("正答数") + 
    ylab("生徒数の割合")
dev.off()

2.6 コーシー分布¶

中心極限定理の前提を満たさない
密度関数が$1/(1 + ax^2)$ に比例

ggplot(data.frame(x = c(-3, 3)), aes(x)) + 
    stat_function(fun = dnorm) + 
    annotate(label = "Normal", geom = "text", x = 1.2, y = 0.35) + 
    stat_function(fun = dcauchy, size = 1) + 
    annotate(label = "Cauchy", geom = "text", x = 0.3, y = 0.16) + 
    scale_x_continuous(breaks = seq(-3, 3, 1)) + 
    scale_y_continuous(limits = c(0, 0.4)) + 
    theme_bw()

x <- rcauchy(1000000)

hist(x)

sort(x) %>>% head()

mean(x)

sd(x)

2.7 正規分布から導かれる分布¶

カイ二乗分布¶

自由度$v$のカイ二乗分布

$X$ は $\mathcal{N}(0, 1)$ に従う

$$\chi^2 = X_1^2 + X_2^2 + \ldots + X_v^2$$

ggplot(data.frame(x = rnorm(1000000)), aes(x)) + 
    geom_histogram()

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

自由度1のカイ二乗分布

ggplot(data.frame(x = rnorm(1000000)^2), aes(x)) + 
    geom_histogram(aes(y = ..density..), colour = "black", fill = gray(0.8)) + 
    stat_function(fun = dchisq, args = list(df = 1)) + 
    theme_bw() + 
    scale_x_continuous(limits = c(0, 8)) + 
    scale_y_continuous(limits = c(0, 1))

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning message:
“Removed 4753 rows containing non-finite values (stat_bin).”

自由度2

ggplot(data.frame(x = rnorm(1000000)^2 + rnorm(1000000)^2), aes(x)) + 
    geom_histogram(aes(y = ..density..), colour = "black", fill = gray(0.8)) + 
    stat_function(fun = dchisq, args = list(df = 2)) + 
    theme_bw() + 
    scale_x_continuous(limits = c(0, 8)) + 
    scale_y_continuous(limits = c(0, 1))

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning message:
“Removed 18315 rows containing non-finite values (stat_bin).”

$t$ 分布¶

$X$ が $\mathcal{N}(0, 1)$，$Y$が自由度$v$の$\chi^2$分布に従うとき自由度$v$の$t$分布は，

$$ t = \frac{X}{\sqrt{Y / v}} $$

fun.t <- function(x){
    (mean(x)-0.5)/(sd(x)/sqrt(12))
}
t <- c(1:12) %>>% 
    lapply(function(x){runif(1000000)}) %>>% 
    {do.call(rbind, (.))} %>>%
    apply(2, fun.t)

ggplot(data.frame(x = t), aes(x)) + 
    geom_histogram(aes(y = ..density..), colour = "black", fill = "gray", size = 0.1) + 
    theme_bw() + 
    scale_x_continuous(breaks =seq(-4, 4, 2), limits = c(-5, 5)) + 
    stat_function(fun = dnorm, size = 0.3, linetype = 2) + 
    stat_function(fun = dt, args = list(df = 11), size = 0.3)

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning message:
“Removed 1134 rows containing non-finite values (stat_bin).”

$F$ 分布¶

$u$ は自由度$v$ の$\chi^2$分布に従う

$$ F = \frac{u_1 / v_1}{u_2 / v_2} $$

ggplot(data.frame(x = c(0, 5)), aes(x)) + 
    stat_function(fun = df, args = list(df1 = 1, df2 = 10)) + 
    stat_function(fun = df, args = list(df1 = 5, df2 = 10), linetype = 2)

devtools::session_info()

Session info -------------------------------------------------------------------
Packages -----------------------------------------------------------------------

 setting  value                       
 version  R version 3.2.3 (2015-12-10)
 system   x86_64, darwin13.4.0        
 ui       X11                         
 language (EN)                        
 collate  ja_JP.UTF-8                 
 tz       Asia/Tokyo                  
 date     2016-10-01                  

 package    * version    date       source                            
 assertthat   0.1        2013-12-06 CRAN (R 3.2.0)                    
 Cairo      * 1.5-9      2015-09-26 CRAN (R 3.2.0)                    
 colorspace   1.2-6      2015-03-11 CRAN (R 3.2.0)                    
 crayon       1.3.2      2016-06-28 CRAN (R 3.2.5)                    
 DBI          0.4-1      2016-05-08 CRAN (R 3.2.5)                    
 devtools     1.12.0     2016-06-24 CRAN (R 3.2.5)                    
 digest       0.6.10     2016-08-02 CRAN (R 3.2.5)                    
 dplyr      * 0.4.3      2015-09-01 CRAN (R 3.2.0)                    
 evaluate     0.9        2016-04-29 CRAN (R 3.2.5)                    
 ggplot2    * 2.1.0.9001 2016-10-01 Github (hadley/ggplot2@feb3ffd)   
 gtable       0.2.0      2016-02-26 CRAN (R 3.2.3)                    
 IRdisplay    0.4.4      2016-08-02 CRAN (R 3.2.5)                    
 IRkernel     0.7        2016-09-22 Github (IRkernel/IRkernel@16ef3ed)
 jsonlite     1.1        2016-09-14 cran (@1.1)                       
 labeling     0.3        2014-08-23 CRAN (R 3.2.0)                    
 lazyeval     0.2.0      2016-06-12 cran (@0.2.0)                     
 magrittr     1.5        2014-11-22 CRAN (R 3.2.0)                    
 memoise      1.0.0      2016-01-29 CRAN (R 3.2.3)                    
 munsell      0.4.3      2016-02-13 CRAN (R 3.2.3)                    
 openxlsx   * 3.0.0      2015-07-03 CRAN (R 3.2.0)                    
 pbdZMQ       0.2-4      2016-09-22 CRAN (R 3.2.3)                    
 pipeR      * 0.6.1.3    2016-04-04 CRAN (R 3.2.4)                    
 plyr         1.8.4      2016-06-08 cran (@1.8.4)                     
 R6           2.1.2      2016-01-26 CRAN (R 3.2.3)                    
 Rcpp         0.12.7     2016-09-05 cran (@0.12.7)                    
 readr      * 0.2.2      2015-10-22 CRAN (R 3.2.0)                    
 readxl     * 0.1.1      2016-03-28 CRAN (R 3.2.4)                    
 repr         0.9        2016-07-24 CRAN (R 3.2.5)                    
 scales       0.4.0      2016-02-26 CRAN (R 3.2.3)                    
 stringi      1.0-1      2015-10-22 CRAN (R 3.2.0)                    
 stringr      1.0.0      2015-04-30 CRAN (R 3.2.0)                    
 tibble       1.2        2016-08-26 cran (@1.2)                       
 tidyr      * 0.4.1      2016-02-05 CRAN (R 3.2.3)                    
 uuid         0.1-2      2015-07-28 CRAN (R 3.2.0)                    
 withr        1.0.1      2016-02-04 CRAN (R 3.2.3)