40個体からそれぞれ1000サイトの遺伝子発現量情報を取得した.これらの情報を基に,40個体の属性に違いがあるかを検討したい.
実はこの40個体のうちいくつかはガンにかかっている個体である.
Data <- read.csv("ISL2nd_Ch12Ex13_cancer.csv")
dim(Data)
## [1] 1000 40
head(round(Data,3),6)
## ID_01 ID_02 ID_03 ID_04 ID_05 ID_06 ID_07 ID_08 ID_09 ID_10 ID_11
## 1 -0.962 0.442 -0.975 1.418 0.819 0.316 -0.025 -0.064 0.031 -0.350 -0.723
## 2 -0.293 -1.139 0.196 -1.281 -0.251 2.512 -0.922 0.060 -1.410 -0.657 -0.116
## 3 0.259 -0.973 0.588 -0.800 -1.820 -2.059 -0.065 1.592 -0.173 -0.121 -0.188
## 4 -1.152 -2.213 -0.862 0.631 0.952 -1.166 -0.392 1.064 -0.350 -1.489 -0.243
## 5 0.196 0.593 0.283 0.247 1.979 -0.871 -0.990 -1.032 -1.110 -0.385 1.651
## 6 0.030 -0.691 -0.403 -0.730 -0.364 1.125 -1.404 -0.806 -1.238 0.578 -0.272
## ID_12 ID_13 ID_14 ID_15 ID_16 ID_17 ID_18 ID_19 ID_20 ID_21 ID_22
## 1 -0.282 1.338 0.702 1.008 -0.465 0.639 0.287 -0.227 -0.220 -1.243 -0.109
## 2 0.826 0.346 -0.570 -0.132 0.690 -0.909 1.303 -1.673 -0.526 0.798 -0.690
## 3 -1.500 -1.229 0.856 1.250 -0.898 0.870 -0.225 0.450 0.551 0.146 0.130
## 4 -0.433 -0.039 -0.058 -1.398 -0.156 -2.736 0.776 0.614 2.019 1.081 -1.077
## 5 -1.745 -0.379 -0.680 -2.132 -0.230 0.466 -1.800 0.626 -0.098 -0.300 -0.530
## 6 2.177 1.436 -1.026 0.298 -0.556 0.205 -1.192 0.235 0.671 0.131 1.069
## ID_23 ID_24 ID_25 ID_26 ID_27 ID_28 ID_29 ID_30 ID_31 ID_32 ID_33
## 1 -1.864 -0.501 -1.325 1.063 -0.296 -0.122 0.085 0.624 -0.510 -0.217 -0.056
## 2 0.900 0.429 -0.676 -0.534 -1.733 -1.603 -1.084 0.033 1.701 0.007 0.099
## 3 1.304 -1.662 -1.630 -0.077 1.306 0.793 1.559 -0.689 -0.615 0.010 0.946
## 4 -0.243 0.513 -0.513 2.552 -2.314 -1.276 -1.229 1.434 -0.284 0.199 -0.092
## 5 -2.024 -0.511 0.046 1.268 -0.744 0.223 0.858 0.275 -0.693 -0.846 -0.177
## 6 1.231 1.134 0.556 -0.359 1.080 -0.206 -0.006 0.164 1.157 0.242 0.089
## ID_34 ID_35 ID_36 ID_37 ID_38 ID_39 ID_40
## 1 -0.484 -0.522 1.949 1.324 0.468 1.061 1.656
## 2 0.564 -0.257 -0.582 -0.170 -0.542 0.313 -1.284
## 3 -0.319 -0.118 0.621 -0.071 0.402 -0.016 -0.527
## 4 0.350 -0.299 1.514 0.671 0.011 -1.044 1.625
## 5 -0.166 1.483 -1.688 -0.141 0.201 -0.676 2.221
## 6 0.183 0.943 -0.210 0.536 -1.185 -0.423 0.624
ID <- colnames(Data)
Data <- t(Data) #転置しただけ
dim(Data)
## [1] 40 1000
MEAN_by_gene <- apply(Data, 2, mean)
SD_by_gene <- apply(Data, 2, sd)
plot(MEAN_by_gene)
plot(SD_by_gene)
各変量を平均が0,分散が1となるよう変換する(正規化).
Data <- scale(Data)
MEAN_by_gene <- apply(Data, 2, mean)
SD_by_gene <- apply(Data, 2, sd)
plot(MEAN_by_gene)
plot(SD_by_gene)
Res.hc.comp <- hclust(dist(Data), method="complete")
plot(Res.hc.comp, main="Complete Linkage", labels=ID, xlab="", ylab="", sub="", cex=0.8)
Res.hc.ave <- hclust(dist(Data), method="average")
plot(Res.hc.ave, main="Average Linkage", labels=ID, xlab="", ylab="", sub="", cex=0.8)
実はID 1~20は健康な個体,ID 21~40はガンに罹患している個体で,遺伝子発現量によって,綺麗に分類できたことが分かる.
Data_Healthy <- Data[1:20, ]
Data_Disease <- Data[21:40,]
Mean_Healthy <- apply(Data_Healthy, 2, mean)
Mean_Disease <- apply(Data_Disease, 2, mean)
plot(Mean_Healthy, col="blue", ylim=c(-1,1))
points(Mean_Disease, col="red")
SD_Healthy <- apply(Data_Healthy, 2, sd)
SD_Disease <- apply(Data_Disease, 2, sd)
plot(SD_Healthy, col="blue", ylim=c(0,2))
points(SD_Disease, col="red")
plot(density(Data_Healthy[,200]), col="blue", type="l")
points(density(Data_Disease[,200]), col="red", type="l")
plot(density(Data_Healthy[,550]), col="blue", type="l")
points(density(Data_Disease[,550]), col="red", type="l")