Rによるやさしい統計学

第I部

第2章

In [1]:
options(repr.plot.width = 4, repr.plot.height = 4)
In [2]:
teaching <- c("C","B","B","A","B","C","A","D","D","D","A", "A", "A","C","B","D","C","C","B","D")
In [3]:
table(teaching)
Out[3]:
teaching
A B C D 
5 5 5 5 
In [4]:
psychology <- c(13,14,7,12,10,6,8,15,4,14,9,6,10,12,5,12,8,8,12,15)
In [5]:
library(ggplot2)
Warning message:
: package ‘ggplot2’ was built under R version 3.2.4
In [6]:
sort(psychology)
Out[6]:
  1. 4
  2. 5
  3. 6
  4. 6
  5. 7
  6. 8
  7. 8
  8. 8
  9. 9
  10. 10
  11. 10
  12. 12
  13. 12
  14. 12
  15. 12
  16. 13
  17. 14
  18. 14
  19. 15
  20. 15
In [7]:
hist(psychology)
In [8]:
hist(psychology, plot=F)
Out[8]:
$breaks
[1]  4  6  8 10 12 14 16

$counts
[1] 4 4 3 4 3 2

$density
[1] 0.100 0.100 0.075 0.100 0.075 0.050

$mids
[1]  5  7  9 11 13 15

$xname
[1] "psychology"

$equidist
[1] TRUE

attr(,"class")
[1] "histogram"
In [9]:
ggplot(data = data.frame(psychology), aes(x = psychology)) + 
    geom_histogram(breaks = hist(psychology, plot=FALSE)$breaks)
  • 平均値
In [10]:
testA <- c(10,13,8,15,8)
sum(testA)
Out[10]:
54
In [11]:
sum(testA) / length(testA)
Out[11]:
10.8
In [12]:
mean(testA)
Out[12]:
10.8
  • 中央値
In [13]:
median(testA)
Out[13]:
10
  • 最頻値
In [14]:
table(testA)
Out[14]:
testA
 8 10 13 15 
 2  1  1  1 
In [15]:
str(table(testA))
 'table' int [1:4(1d)] 2 1 1 1
 - attr(*, "dimnames")=List of 1
  ..$ testA: chr [1:4] "8" "10" "13" "15"
In [16]:
which.max(table(testA))
Out[16]:
8: 1
  • 分散,標準偏差
  • 不偏分散
In [17]:
var(testA)
Out[17]:
9.7
  • これは標本分散
In [18]:
my_var <- function(x){
    m <- mean(x)
    dsq <- (x - m)^2
    return(sum(dsq) / length(x))
}
my_var(testA)
Out[18]:
7.76
In [19]:
var(testA) * (length(testA) - 1) / length(testA)
Out[19]:
7.76
In [20]:
my_var(testA) *  length(testA) / (length(testA) - 1)
Out[20]:
9.7
In [21]:
sd(testA)
Out[21]:
3.11448230047949
In [22]:
sqrt(my_var(testA))
Out[22]:
2.78567765543682
In [23]:
sqrt(my_var(testA) *  length(testA) / (length(testA) - 1))
Out[23]:
3.11448230047949
  • 平均偏差
In [24]:
library(pipeR)
Warning message:
: package ‘pipeR’ was built under R version 3.2.4
In [25]:
(testA - mean(testA)) %>>% abs %>>% mean
Out[25]:
2.56
  • 範囲
In [26]:
max(testA) - min(testA)
Out[26]:
7

2.10 標準化

In [27]:
psychology
Out[27]:
  1. 13
  2. 14
  3. 7
  4. 12
  5. 10
  6. 6
  7. 8
  8. 15
  9. 4
  10. 14
  11. 9
  12. 6
  13. 10
  14. 12
  15. 5
  16. 12
  17. 8
  18. 8
  19. 12
  20. 15
In [28]:
psy_mean <- mean(psychology)
psy_mean
Out[28]:
10
In [29]:
psy_sd <- (psychology - psy_mean)^2 %>>% mean %>>% sqrt
psy_sd
Out[29]:
3.33166624979154
In [30]:
z_score <- (psychology - psy_mean) / psy_sd
z_score
Out[30]:
  1. 0.900450337781496
  2. 1.20060045037533
  3. -0.900450337781496
  4. 0.600300225187664
  5. 0
  6. -1.20060045037533
  7. -0.600300225187664
  8. 1.50075056296916
  9. -1.80090067556299
  10. 1.20060045037533
  11. -0.300150112593832
  12. -1.20060045037533
  13. 0
  14. 0.600300225187664
  15. -1.50075056296916
  16. 0.600300225187664
  17. -0.600300225187664
  18. -0.600300225187664
  19. 0.600300225187664
  20. 1.50075056296916
In [31]:
mean(z_score)
Out[31]:
-2.75387351811318e-18
In [32]:
(z_score - mean(z_score))^2 %>>% mean %>>% sqrt
Out[32]:
1

2.11 偏差値

In [33]:
std_score <- 10 * z_score + 50
std_score
Out[33]:
  1. 59.004503377815
  2. 62.0060045037533
  3. 40.995496622185
  4. 56.0030022518766
  5. 50
  6. 37.9939954962467
  7. 43.9969977481234
  8. 65.0075056296916
  9. 31.9909932443701
  10. 62.0060045037533
  11. 46.9984988740617
  12. 37.9939954962467
  13. 50
  14. 56.0030022518766
  15. 34.9924943703084
  16. 56.0030022518766
  17. 43.9969977481234
  18. 43.9969977481234
  19. 56.0030022518766
  20. 65.0075056296916
In [34]:
mean(std_score)
Out[34]:
50
In [35]:
(std_score - mean(std_score))^2 %>>% mean %>>% sqrt
Out[35]:
10

練習問題

In [36]:
a <- c(60, 100, 50, 40, 50, 230, 120, 240, 200, 30)
b <- c(50, 60, 40, 50, 100, 80, 30, 20, 100, 120)
In [37]:
options(repr.plot.width = 8, repr.plot.height = 4)
In [38]:
layout(matrix(c(1, 2), 1, 2, byrow = TRUE))
hist(a, ylim = c(0, 5))
hist(b, breaks = hist(a, plot=FALSE)$breaks)
In [39]:
library(tidyr)
In [40]:
data.frame(a = a, b = b) %>>% 
    tidyr::gather(key, val, a:b) %>>% 
    ggplot(aes(x = val)) + 
        geom_histogram(breaks = hist(a, plot = FALSE)$breaks) -> gp
gp + facet_wrap(~key)
In [41]:
sd_a <- (a-mean(a))^2 %>>% mean %>>% sqrt
sd_b <- (b-mean(b))^2 %>>% mean %>>% sqrt
cat("Aの平均: ", mean(a), ", 標準偏差: ", sd_a, "\n")
cat("Aの平均: ", mean(b), ", 標準偏差: ", sd_b, "\n")
Aの平均:  112 , 標準偏差:  77.82031 
Aの平均:  65 , 標準偏差:  31.70173 
  • AのZ得点
In [42]:
z_a <- (a - mean(a)) / sd_a
z_a
Out[42]:
  1. -0.668206060656455
  2. -0.154201398613028
  3. -0.796707226167312
  4. -0.925208391678169
  5. -0.796707226167312
  6. 1.51631375302811
  7. 0.102800932408685
  8. 1.64481491853897
  9. 1.13081025649554
  10. -1.05370955718903
In [43]:
mean(z_a)
Out[43]:
-4.18502038579405e-18
In [44]:
(z_a - mean(z_a))^2 %>>% mean %>>% sqrt
Out[44]:
1
  • BのZ得点
In [45]:
z_b <- (b - mean(b)) / sd_b
z_b
Out[45]:
  1. -0.473160223407384
  2. -0.157720074469128
  3. -0.78860037234564
  4. -0.473160223407384
  5. 1.1040405212839
  6. 0.473160223407384
  7. -1.1040405212839
  8. -1.41948067022215
  9. 1.1040405212839
  10. 1.73492081916041
In [46]:
mean(z_b)
Out[46]:
-1.11239142897013e-17
In [47]:
(z_b - mean(z_b))^2 %>>% mean %>>% sqrt
Out[47]:
1
In [48]:
devtools::session_info()
Session info -------------------------------------------------------------------
Packages -----------------------------------------------------------------------
Session info -------------------------------------------------------------------
Packages -----------------------------------------------------------------------
Out[48]:
 setting  value                       
 version  R version 3.2.3 (2015-12-10)
 system   x86_64, darwin13.4.0        
 ui       X11                         
 language (EN)                        
 collate  ja_JP.UTF-8                 
 tz       Asia/Tokyo                  
 date     2016-05-15                  

 package    * version date       source        
 assertthat   0.1     2013-12-06 CRAN (R 3.2.0)
 base64enc    0.1-3   2015-07-28 CRAN (R 3.2.0)
 Cairo        1.5-9   2015-09-26 CRAN (R 3.2.0)
 colorspace   1.2-6   2015-03-11 CRAN (R 3.2.0)
 DBI          0.4-1   2016-05-08 CRAN (R 3.2.5)
 devtools     1.11.1  2016-04-21 CRAN (R 3.2.5)
 digest       0.6.9   2016-01-08 CRAN (R 3.2.2)
 dplyr        0.4.3   2015-09-01 CRAN (R 3.2.0)
 evaluate     0.9     2016-04-29 CRAN (R 3.2.5)
 ggplot2    * 2.1.0   2016-03-01 CRAN (R 3.2.4)
 gtable       0.2.0   2016-02-26 CRAN (R 3.2.3)
 IRdisplay    0.3     2016-05-14 local         
 IRkernel     0.5     2016-05-14 local         
 jsonlite     0.9.20  2016-05-10 CRAN (R 3.2.5)
 labeling     0.3     2014-08-23 CRAN (R 3.2.0)
 lazyeval     0.1.10  2015-01-02 CRAN (R 3.2.0)
 magrittr     1.5     2014-11-22 CRAN (R 3.2.0)
 memoise      1.0.0   2016-01-29 CRAN (R 3.2.3)
 munsell      0.4.3   2016-02-13 CRAN (R 3.2.3)
 pipeR      * 0.6.1.3 2016-04-04 CRAN (R 3.2.4)
 plyr         1.8.3   2015-06-12 CRAN (R 3.2.0)
 R6           2.1.2   2016-01-26 CRAN (R 3.2.3)
 Rcpp         0.12.4  2016-03-26 CRAN (R 3.2.4)
 repr         0.7     2016-05-13 CRAN (R 3.2.3)
 rzmq         0.7.7   2016-05-14 local         
 scales       0.4.0   2016-02-26 CRAN (R 3.2.3)
 stringi      1.0-1   2015-10-22 CRAN (R 3.2.0)
 stringr      1.0.0   2015-04-30 CRAN (R 3.2.0)
 tidyr      * 0.4.1   2016-02-05 CRAN (R 3.2.3)
 uuid         0.1-2   2015-07-28 CRAN (R 3.2.0)
 withr        1.0.1   2016-02-04 CRAN (R 3.2.3)