決定的MDS - ryamadaのコンピュータ・数学メモ

こちらで、木グラフ上の点の点間距離からMDSをすることを書いた
木であるということを利用してMDSするとどうなるだろうか
- 木であるということは、ノードからのエッジの生え方に偏りを与える理由がないということ
- 偏りがないということは、平等に生やすのがよいということ
- 次数kのノードはk個の頂点を持つk-1正単体の頂点座標に向かって生やすのがよいということ
それを利用して、「木のための決定的MDS」をやってみよう
これでやると、「必要かつ十分」な次元はノードの次数情報から、確実に求まるのもメリット
- 次数１、２のノードは全体の表示次元に影響しない(グラフなので、１次元は「絶対不可欠である」という前提による)
- 次数が３以上のノードは、少なくとも、そのノードの周囲局所について、次数-１の次元が必要
- 次数が２以上のノードが隣接していて、エッジを１本共有しているときには、２つのノードの次数ー１という値の和から１小さい次元が必要
- 結局、次のような関数で次元は出せる

# dgはノードの次数
dim.tree.dg <- function(dg){
	sum(dg[which(dg >=2)]) -length(which(dg >=2))*2 +1
}

そんなこんなで、木のための決め打ちMDS座標を作ってみよう

library(igraph)
library(vegan)
library(rgl)
dim.tree.dg <- function(dg){
	sum(dg[which(dg >=2)]) -length(which(dg >=2))*2 +1
}
make.mst.graph <- function(X){
	mst.x <- spantree(dist(X))
	e.x <- cbind(2:(length(mst.x$kid)+1),mst.x$kid)
	#e.x <- cbind(1:(length(mst.x$kid)),mst.x$kid-1)
	#graph.edgelist(e.x)
	return(list(g = graph.edgelist(e.x),mst=mst.x))
	#return(list(g = add.edges(graph.empty(length(X[,1])), e.x, weight=mst.x$dist),mst=mst.x))
}
CategoryVector<-function (d = 3) 
{
    if(d == 1){
     return(matrix(1,ncol=1,nrow=1))
    }
    df <- d - 1
    diagval <- 1:d
    diagval <- sqrt((df + 1)/df) * sqrt((df - diagval + 1)/(df - 
        diagval + 2))
    others <- -diagval/(df - (0:(d - 1)))
    m <- matrix(rep(others, df + 1), nrow = df + 1, byrow = TRUE)
    diag(m) <- diagval
    m[upper.tri(m)] <- 0
    as.matrix(m[, 1:df])
}
Simplex.Vector <- function(n){
	if(n==1){
		return(matrix(1,ncol=1,nrow=1))
	}
	X <- CategoryVector(n)
	X <- cbind(rep(1/sqrt(n),n),sqrt((n-1)/n)*X)
	X
}

Cartesian2AngularX<-function(x){
	n<-length(x)
	if(n==1){
		return(list(r=abs(x),t=acos(sign(x))))
	}else if(n==2){
		tmpt<-0
		if(x[1]!=0){
			tmpt<-acos(x[1]/sqrt(sum(x^2)))*sign(x[2])
			if(sign(x[2])==0){
				if(x[1]>0){
					tmpt<-0
				}else if(x[1]<0){
					tmpt<-pi
				}
			}
			#tmpt<-atan(x[2]/x[1])
		}else{
			if(x[2]>0){
				tmpt<-pi/2
			}else if(x[2]<0){
				tmpt<--pi/2
			}
		}
		return(list(r=sqrt(sum(x^2)),t=tmpt))
	}
	r<-sqrt(sum(x^2))
	xst<-x/r
	#n<-length(xst)
	t<-rep(0,n-1)
	S<-C<-t
	S[n-1]<-xst[n]
	if(S[n-1]>1)S[n-1]<-1
	if(S[n-1]<(-1))S[n-1]<-(-1)
	t[n-1]<-asin(S[n-1])
	C[n-1]<-cos(t[n-1])
	cumC<-C[n-1]
	for(i in (n-2):1){
		if(cumC!=0){
			S[i]<-xst[i+1]/cumC
			if(S[i]>1)S[i]<-1
			if(S[i]<(-1))S[i]<-(-1)
			t[i]<-asin(S[i])
			C[i]<-cos(t[i])
			cumC<-cumC*C[i]
		}else{
			S[i]<-0
			t[i]<-asin(S[i])
			C[i]<-cos(t[i])
		}
		
	}
	list(r=r,t=t)
}
# (1,0,...,0)ベクトルを与えたベクトルに移す回転行列(もしくはその逆行列)を返す

VectorRotation<-function(P,inv=FALSE){ # inv=TRUEは逆行列
	n<-length(P)
	#print(n)
	if(n==1){
		return(matrix(sign(P)*c(1),1,1))
	}
	Q<-Cartesian2AngularX(P)

	M<-diag(n)
	M2<-M

	for(i in 1:length(Q[[2]])){
		tmp<-M
		tmp[i,i]<--sin(Q[[2]][i])
		tmp[i,i+1]<-cos(Q[[2]][i])
		tmp[i+1,i]<-cos(Q[[2]][i])
		tmp[i+1,i+1]<-sin(Q[[2]][i])
		M2<-tmp%*%M2
	}
	M2<-t(M2)[,n:1]
	
	S <- rep(1,length(P))
	S2 <- sign(M2[,1]*P)
	S[which(S2<0)] <- -1
	#S<-sign(M2[,1]*P)
	M2<-M2*S
	if(inv){
		#M2<-solve(M2)
		M2<-t(M2)
	}
	M2
}

# gはigraphパッケージのグラフオブジェクト
coords.tree <- function(g,weight=NULL){
	dg <- igraph::degree(g)
	if(is.null(weight)){
		weight <- rep(1,length(dg))
	}
	# ノードIDは1から付番していることを要求するので、そのチェックを修正
	zero.start <- FALSE

	if(dg[1] == 0){
		zero.start <- TRUE
		dg <- dg[-1]
	}
	# ノードとそこから生えるエッジは、ノードの次数より一つ低い次元に納められる
	# 木IDに対して、空間の次元を与える
	# この情報も、後の処理の過程で更新されていく
	dg.tree <- dg-1
	dg.tree[which(dg.tree<1)] <- 1

	# エッジに関する処理
	ed.list <- get.edgelist(g)
	num.edge <- length(ed.list[,1])
	# 各ノードのエッジをエッジID(0から始まる)で付与
	tmp.ed.list.per.node <- get.adjedgelist(g)
	ed.list.per.node <- list()
	if(zero.start){
		for(i in 2:length(tmp.ed.list.per.node)){
			ed.list.per.node[[i-1]] <- tmp.ed.list.per.node[[i]] +1
		}
	}else{
		ed.list.per.node <- tmp.ed.list.per.node
	}

	# 処理
	# 初期状態はノード数の木からなる林
	# 初期状態の木は、ノードが１つで、そこから次数分のエッジが生えていて、エッジの端点は開放されている
	# 次数分のエッジの方向は、正単体の頂点方向とする
	# エッジを１本ずつつないで行って、最後に１本の木にする
	# エッジをつなぐにあたって、ノード周囲のエッジに制約が入るので、
	# その制約を成長させていく
	
	# 木が収まるべき次元
	d <- dim.tree.dg(dg)

	# エッジの向きは正単体頂点座標ベクトルを使って算出する
	max.simplex <- max(dg)

	Simplex.list <- list()
	for(i in 1:max.simplex){
		Simplex.list[[i]] <- CategoryVector(i) 
	}
	# 個々のノードが帰属する木のIDを格納したベクトル
	tr.n <- 1:length(dg)
	# エッジも木に帰属している
	# ただし、帰属先の木が２個から始まり、だんだん１つの木に属するようになる
	tr.e1 <- ed.list[,1]
	tr.e2 <- ed.list[,2]


	e.mat1 <- e.mat2 <- matrix(0,num.edge,d)
	e.v.len1 <- e.v.len2 <- rep(0,num.edge)

	# 初期状態では、エッジの方向ベクトルは低次元に納まる
	# それを左詰めで格納しておく
	# エッジの向きを考慮して納める
	for(i in 1:num.edge){
		n1 <- tr.e1[i]
		n2 <- tr.e2[i]
		tmpv1 <- Simplex.list[[dg[n1]]][which(ed.list.per.node[[n1]] == i),]
		tmpv2 <- -Simplex.list[[dg[n2]]][which(ed.list.per.node[[n2]] == i),]
		e.v.len1[i] <- length(tmpv1)
		e.v.len2[i] <- length(tmpv2)

		e.mat1[i,1:e.v.len1[i]] <- tmpv1
		e.mat2[i,1:e.v.len2[i]] <- tmpv2
	}

	# エッジを１本ずつ処理
	for(i in 1:num.edge){
		# 処理対象エッジの２ノードのIDを取り出す
		node1 <- ed.list[i,1]
		node2 <- ed.list[i,2]
		# 処理対象エッジが結びつく２ノードが帰属する木のIDを取り出す
		tr1 <- tr.n[node1]
		tr2 <- tr.n[node2]
		# 帰属木が異なるときに連結する処理をする
		if(tr1 != tr2){
			# 対象となる２つの木に属するエッジIDをe.mat1, e.mat2の帰属別に取り出す
			e1.1 <- which(tr.e1 == tr1)
			e1.2 <- which(tr.e2 == tr1)
			e2.1 <- which(tr.e1 == tr2)
			e2.2 <- which(tr.e2 == tr2)

			# 現時点でのエッジ方向座標を木１、木２ごとに取り出す
			tmp1.1 <- (e.mat1[e1.1,1:dg.tree[tr1]])
			tmp1.2 <- (e.mat2[e1.2,1:dg.tree[tr1]])
			
			tmp2.1 <- (e.mat1[e2.1,1:dg.tree[tr2]])
			tmp2.2 <- (e.mat2[e2.2,1:dg.tree[tr2]])
			# 行列ハンドリングをするための措置
			if(is.vector(tmp1.1)){
				tmp1.1 <- matrix(tmp1.1,byrow=TRUE,ncol=dg.tree[tr1])
			}
			if(is.vector(tmp1.2)){
				tmp1.2 <- matrix(tmp1.2,byrow=TRUE,ncol=dg.tree[tr1])
			}
			if(is.vector(tmp2.1)){
				tmp2.1 <- matrix(tmp2.1,byrow=TRUE,ncol=dg.tree[tr2])
			}
			if(is.vector(tmp2.2)){
				tmp2.2 <- matrix(tmp2.2,byrow=TRUE,ncol=dg.tree[tr2])
			}
			
			dup1 <- duplicated(c(e1.1,e1.2))
			dup2 <- duplicated(c(e2.1,e2.2))
			
			tmp1 <- rbind(tmp1.1,tmp1.2)
			tmp2 <- rbind(tmp2.1,tmp2.2)
			
			# 行列ハンドリングをするための措置
			if(is.vector(tmp1)){
				tmp1 <- matrix(tmp1,nrow=length(e1),byrow=TRUE)
			}
			if(is.vector(tmp2)){
				tmp2 <- matrix(tmp2,nrow=length(e2),byrow=TRUE)
			}
			
			# 木１、木２に属するエッジIDをまとめる
			e1 <- unique(c(e1.1,e1.2))
			e2 <- unique(c(e2.1,e2.2))
			# 木１、木２に共有されるエッジIDはただ一つ存在する
			dup.e <- c(e1,e2)[which(duplicated(c(e1,e2)))]

			# 処理対象エッジに相当するエッジの方向ベクトルが、木１、木２で同じになるように回転させる
			# その回転ベクトルを作る
			rot1 <- VectorRotation(tmp1[which(c(e1.1,e1.2) == dup.e)[1],],inv = TRUE)
			rot2 <- VectorRotation(tmp2[which(c(e2.1,e2.2) == dup.e)[1],],inv = TRUE)
			# エッジの方向ベクトルを木ごとに回転する
			rot.tmp1 <- t(rot1 %*% t(tmp1))
			rot.tmp2 <- t(rot2 %*% t(tmp2))
			dim1 <- dim(tmp1)
			dim2 <- dim(tmp2)

			# 回転して、一致させるエッジは、c(1,0,...)とc(-1,0,...)と言うように逆向きにする
			m12 <- matrix(0,dim1[1]+dim2[1],dim1[2]+dim2[2])
			dim12 <- dim(m12)
			m12[1:dim1[1],1:dim1[2]] <- rot.tmp1
			# 逆向きにするために、木２の方のエッジ方向ベクトルは反転する
			m12[(dim1[1]+1):dim12[1],(dim1[2]+1):dim12[2]] <- -rot.tmp2
			# ただし、一致させるエッジだけは、反転させない
			m12[(dim1[1]+1):dim12[1],1] <- rot.tmp2[,1]
			m12 <- m12[,(1:dim12[2])[-(dim1[2]+1)]]
			if(is.vector(m12)){
				m12 <- matrix(m12,ncol=1)
			}
			
			new.dim12 <- dim(m12)
			
			# e.mat1,e.mat2行列に格納してあったエッジ方向ベクトルを更新する
			# 更新されたベクトルの長さは(不変のこともあるが)長くなっていく
			e.mat1[e1.1,1:new.dim12[2]] <- m12[1:length(e1.1),1:new.dim12[2]]
			e.mat2[e1.2,1:new.dim12[2]] <- m12[(length(e1.1)+1):length(c(e1.1,e1.2)),1:new.dim12[2]]
			e.mat1[e2.1,1:new.dim12[2]] <- m12[(dim1[1]+1):(dim1[1]+length(e2.1)),1:new.dim12[2]]
			e.mat2[e2.2,1:new.dim12[2]] <- m12[(dim1[1]+length(e2.1)+1):new.dim12[1],1:new.dim12[2]]
			
			# 帰属する木の情報も更新する
			tr.n[which(tr.n==tr2)] <- tr1
			tr.e1[which(tr.e1 == tr2)] <- tr1
			tr.e2[which(tr.e2 == tr2)] <- tr1
			new.dg <- dg.tree[tr1] + dg.tree[tr2] -1
			dg.tree[tr1] <- dg.tree[tr2] <- new.dg

		}
	}
	# エッジの向き！
	X <- X.w <- matrix(0,length(tr.n),d)
	both.ed.list <- rbind(ed.list,ed.list[,c(2,1)])
	both.ed.vector <- rbind(e.mat1,-e.mat1)
	weight2 <- rep(weight,2)
	paths <- get.shortest.paths(graph.edgelist(ed.list,directed=FALSE),1)
	st.i <- 2
	if(zero.start){
		st.i <- 3
	}
	for(i in st.i:length(paths)){
		tmp <- tmp.w <- rep(0,d)
		for(j in 1:(length(paths[[i]])-1)){
			id <- which(both.ed.list[,1] == paths[[i]][j] & both.ed.list[,2] == paths[[i]][j+1])
			tmp <- tmp + both.ed.vector[id,]
			tmp.w <- tmp.w + both.ed.vector[id,]*weight2[id]
		}
		
		X[i-1,] <- tmp
		X.w[i-1,] <- tmp.w
	}
	return(list(e.vectors = e.mat1,dim =d,g=g,X=X,X.w = X.w))
}
df <- 5

k <- 3

n.pts <- rep(4,k)

X <- NULL

for(i in 1:k){
	s <- sample(1:df,n.pts[i],replace=TRUE)
	tmp <- matrix(0, n.pts[i],df)
	for(j in 1:length(s)){
		tmp[j,s[j]]<-sample(c(-1,1),1)
	}
	X <- rbind(X,apply(tmp,2,cumsum))
}

df <- 5

k <- 3

n.pts <- rep(4,k)

X <- NULL

for(i in 1:k){
	s <- sample(1:df,n.pts[i],replace=TRUE)
	tmp <- matrix(0, n.pts[i],df)
	for(j in 1:length(s)){
		tmp[j,s[j]]<-sample(c(-1,1),1)
	}
	X <- rbind(X,apply(tmp,2,cumsum))
}

plot3d(X)
M <- jitter(X,10)

# 雲の重心からの距離に応じて、0,1のフェノタイプを与える
center <- apply(M,2,mean)
d.from.center <- apply((t(M)-center)^2,2,sum)

phenotype <- rep(0,length(M[,1]))

phenotype[which(d.from.center>mean(d.from.center))] <- 1

# フェノタイプで色分けして３次元分だけプロット
plot3d(M[,1:3],col = phenotype +1)

# g は最小全域木のグラフオブジェクト
g <- make.mst.graph(M)
# このグラフのペアワイズ点間距離行列を作る

cds.tr.out <- coords.tree(g$g)
# グラフが納まっている次元
cds.tr.out$dim

# 平面にプロットするなら、グラフ視覚化手法でやるのがよいから
plot(g$mst)

# グラフの最小次元配置
# エッジの長さを無視した、トポロジー的な座標
cds.tr.out$X
# エッジの長さを考慮した、座標
cds.tr.out$X.w

par(ask = TRUE)
for(i in 1:(cds.tr.out$dim-1)){
	for(j in (i+1):cds.tr.out$dim){
		plot(cds.tr.out$g,layout = cds.tr.out$X.w[,c(i,j)])

	}
}