問題設定

40個体からそれぞれ1000サイトの遺伝子発現量情報を取得した.これらの情報を基に,40個体の属性に違いがあるかを検討したい.

実はこの40個体のうちいくつかはガンにかかっている個体である.

Data <- read.csv("ISL2nd_Ch12Ex13_cancer.csv")
dim(Data)
## [1] 1000   40
head(round(Data,3),6)
##    ID_01  ID_02  ID_03  ID_04  ID_05  ID_06  ID_07  ID_08  ID_09  ID_10  ID_11
## 1 -0.962  0.442 -0.975  1.418  0.819  0.316 -0.025 -0.064  0.031 -0.350 -0.723
## 2 -0.293 -1.139  0.196 -1.281 -0.251  2.512 -0.922  0.060 -1.410 -0.657 -0.116
## 3  0.259 -0.973  0.588 -0.800 -1.820 -2.059 -0.065  1.592 -0.173 -0.121 -0.188
## 4 -1.152 -2.213 -0.862  0.631  0.952 -1.166 -0.392  1.064 -0.350 -1.489 -0.243
## 5  0.196  0.593  0.283  0.247  1.979 -0.871 -0.990 -1.032 -1.110 -0.385  1.651
## 6  0.030 -0.691 -0.403 -0.730 -0.364  1.125 -1.404 -0.806 -1.238  0.578 -0.272
##    ID_12  ID_13  ID_14  ID_15  ID_16  ID_17  ID_18  ID_19  ID_20  ID_21  ID_22
## 1 -0.282  1.338  0.702  1.008 -0.465  0.639  0.287 -0.227 -0.220 -1.243 -0.109
## 2  0.826  0.346 -0.570 -0.132  0.690 -0.909  1.303 -1.673 -0.526  0.798 -0.690
## 3 -1.500 -1.229  0.856  1.250 -0.898  0.870 -0.225  0.450  0.551  0.146  0.130
## 4 -0.433 -0.039 -0.058 -1.398 -0.156 -2.736  0.776  0.614  2.019  1.081 -1.077
## 5 -1.745 -0.379 -0.680 -2.132 -0.230  0.466 -1.800  0.626 -0.098 -0.300 -0.530
## 6  2.177  1.436 -1.026  0.298 -0.556  0.205 -1.192  0.235  0.671  0.131  1.069
##    ID_23  ID_24  ID_25  ID_26  ID_27  ID_28  ID_29  ID_30  ID_31  ID_32  ID_33
## 1 -1.864 -0.501 -1.325  1.063 -0.296 -0.122  0.085  0.624 -0.510 -0.217 -0.056
## 2  0.900  0.429 -0.676 -0.534 -1.733 -1.603 -1.084  0.033  1.701  0.007  0.099
## 3  1.304 -1.662 -1.630 -0.077  1.306  0.793  1.559 -0.689 -0.615  0.010  0.946
## 4 -0.243  0.513 -0.513  2.552 -2.314 -1.276 -1.229  1.434 -0.284  0.199 -0.092
## 5 -2.024 -0.511  0.046  1.268 -0.744  0.223  0.858  0.275 -0.693 -0.846 -0.177
## 6  1.231  1.134  0.556 -0.359  1.080 -0.206 -0.006  0.164  1.157  0.242  0.089
##    ID_34  ID_35  ID_36  ID_37  ID_38  ID_39  ID_40
## 1 -0.484 -0.522  1.949  1.324  0.468  1.061  1.656
## 2  0.564 -0.257 -0.582 -0.170 -0.542  0.313 -1.284
## 3 -0.319 -0.118  0.621 -0.071  0.402 -0.016 -0.527
## 4  0.350 -0.299  1.514  0.671  0.011 -1.044  1.625
## 5 -0.166  1.483 -1.688 -0.141  0.201 -0.676  2.221
## 6  0.183  0.943 -0.210  0.536 -1.185 -0.423  0.624
ID <- colnames(Data)

データの前処理

Data <- t(Data) #転置しただけ
dim(Data)
## [1]   40 1000
MEAN_by_gene <- apply(Data, 2, mean)
SD_by_gene <- apply(Data, 2, sd)

plot(MEAN_by_gene)

plot(SD_by_gene)

各変量を平均が0,分散が1となるよう変換する(正規化).

Data <- scale(Data)
MEAN_by_gene <- apply(Data, 2, mean)
SD_by_gene <- apply(Data, 2, sd)

plot(MEAN_by_gene)

plot(SD_by_gene)

階層的クラスタリングによるクラスター化

“complete”距離を用いる場合

Res.hc.comp <- hclust(dist(Data), method="complete")
plot(Res.hc.comp, main="Complete Linkage", labels=ID, xlab="", ylab="", sub="", cex=0.8)

Res.hc.ave <- hclust(dist(Data), method="average")
plot(Res.hc.ave, main="Average Linkage", labels=ID, xlab="", ylab="", sub="", cex=0.8)

実は

実はID 1~20は健康な個体,ID 21~40はガンに罹患している個体で,遺伝子発現量によって,綺麗に分類できたことが分かる.

Data_Healthy <- Data[1:20, ]
Data_Disease <- Data[21:40,]

Mean_Healthy <- apply(Data_Healthy, 2, mean)
Mean_Disease <- apply(Data_Disease, 2, mean)

plot(Mean_Healthy, col="blue", ylim=c(-1,1))
points(Mean_Disease, col="red")

SD_Healthy <- apply(Data_Healthy, 2, sd)
SD_Disease <- apply(Data_Disease, 2, sd)

plot(SD_Healthy, col="blue", ylim=c(0,2))
points(SD_Disease, col="red")

plot(density(Data_Healthy[,200]), col="blue", type="l")
points(density(Data_Disease[,200]), col="red", type="l")

plot(density(Data_Healthy[,550]), col="blue", type="l")
points(density(Data_Disease[,550]), col="red", type="l")