sample():RcppArmadilloを使ってみる
- Cppの線形代数ライブラリArmadillo
- sample()をcppにしてみよう。速くはならないけれど、cpp内での処理が可能になる
- "csample_char.cpp","csample_num.cpp"という関数ファイルを作って、sourceRcpp()する(参考)
// [[Rcpp::depends(RcppArmadillo)]] #include <RcppArmadilloExtensions/sample.h> using namespace Rcpp ; // [[Rcpp::export]] CharacterVector csample_char( CharacterVector x, int size, bool replace, NumericVector prob = NumericVector::create() ) { RNGScope scope ; CharacterVector ret = RcppArmadillo::sample(x, size, replace, prob) ; return ret ; }
#include <RcppArmadilloExtensions/sample.h> // [[Rcpp::depends(RcppArmadillo)]] using namespace Rcpp ; // [[Rcpp::export]] NumericVector csample_num( NumericVector x, int size, bool replace, NumericVector prob = NumericVector::create() ) { RNGScope scope; NumericVector ret = RcppArmadillo::sample(x, size, replace, prob); return ret; }
sourceCpp("csample_char.cpp") sourceCpp("csample_num.cpp")
- 使い方
N <- 10 set.seed(7) sample.r <- sample(letters, N, replace=T) set.seed(7) sample.c <- csample_char(letters, N, replace=T) print(identical(sample.r, sample.c))
- Rのsample()との比較
library(rbenchmark) Loading required package: rbenchmark set.seed(7) ## Definition of Sampling Frame n.elem <- 1e2 frame1 <- rnorm(n.elem) probs1 <- runif(n.elem) ## Definition of sampling regime ## Use replacement throughout .replace <- TRUE ## Samplesize n.samples1 <- 1e4 ## Without probabilities benchmark(r = sample(frame1, n.samples1, replace=.replace), cpp = csample_num(frame1, n.samples1, replace=.replace), replications = 1e3, order = 'relative', columns = c("test", "replications", "relative", "elapsed") )
繰り返し距離:RcppArmadilloを使ってみる
- こちらのマハラノビス距離の繰り返し計算でスピードアップの様子を見てみよう
- cxxfunction()関数を使っている例をsourceCpp()用に書き換えてみる
- データ型指定はRcpp namespace形式らしい
## Using RcppArmadillo code <- 'arma::mat x = Rcpp::as<arma::mat>(X); arma::mat mu = Rcpp::as<arma::mat>(Mu); arma::mat sigma = Rcpp::as<arma::mat>(Sigma); int n = x.n_rows; arma::vec md(n); for (int i=0; i<n; i++){ arma::mat x_i = x.row(i) - mu; arma::mat Y = arma::solve( sigma, arma::trans(x_i) ); md(i) = arma::as_scalar(x_i * Y); } return wrap(md);' mahalanobis_RcppArma <- cxxfunction( signature(X="numeric", Mu="numeric", Sigma="numeric"), code, plugin="RcppArmadillo")
#include <RcppArmadilloExtensions/sample.h> // [[Rcpp::depends(RcppArmadillo)]] using namespace Rcpp ; // [[Rcpp::export]] NumericVector mahalanobis_RcppArma(NumericMatrix X,NumericMatrix Mu, NumericMatrix Sigma){ arma::mat x = Rcpp::as<arma::mat>(X); arma::mat mu = Rcpp::as<arma::mat>(Mu); arma::mat sigma = Rcpp::as<arma::mat>(Sigma); int n = x.n_rows; arma::vec md(n); for (int i=0; i<n; i++){ arma::mat x_i = x.row(i) - mu; arma::mat Y = arma::solve( sigma, arma::trans(x_i) ); md(i) = arma::as_scalar(x_i * Y); } return wrap(md); }
## Using R mahalanobis_R <- function(X, Mu, Sigma){ md <- rep(NA, nrow(X) ) for(i in 1:nrow(X) ) md[i] <- mahalanobis(X[i,], center=Mu, cov=Sigma) return(md) }
X <- matrix(runif(10000),100,100) Mu <- matrix(runif(100),1,100) Sigma <- matrix(runif(10000),100,100) benchmark( mahalanobis_R(X, c(Mu), Sigma), mahalanobis_RcppArma(X, Mu, Sigma), columns=c("test", "replications","elapsed", "relative"), replications=100, order="relative")
test replications elapsed relative 2 mahalanobis_RcppArma(X, Mu, Sigma) 100 10.11 1.000 1 mahalanobis_R(X, c(Mu), Sigma) 100 49.02 4.849
fastLmPure():RcppArmadilloを使ってみる
- 線形回帰
data(trees, package="datasets") fastLmPure( cbind(1, log(trees$Girth)), log(trees$Volume) ) fastLm( log(Volume) ~ log(Girth), data=trees) lm(log(trees$Volume) ~ log(trees$Girth))
> fastLmPure( cbind(1, log(trees$Girth)), log(trees$Volume) ) $coefficients [,1] [1,] -2.353325 [2,] 2.199970 $stderr [,1] [1,] 0.23066284 [2,] 0.08983455 $df.residual [1] 29 > fastLm( log(Volume) ~ log(Girth), data=trees) Call: fastLm.formula(formula = log(Volume) ~ log(Girth), data = trees) Coefficients: (Intercept) log(Girth) -2.3533 2.2000 > lm(log(trees$Volume) ~ log(trees$Girth)) Call: lm(formula = log(trees$Volume) ~ log(trees$Girth)) Coefficients: (Intercept) log(trees$Girth) -2.353 2.200
- 速さのベンチマーク
library(rbenchmark) microbenchmark( fastLmPure( cbind(1, log(trees$Girth)), log(trees$Volume) ), fastLm( log(Volume) ~ log(Girth), data=trees), lm(log(trees$Volume) ~ log(trees$Girth)) )
> microbenchmark( + fastLmPure( cbind(1, log(trees$Girth)), log(trees$Volume) ), + fastLm( log(Volume) ~ log(Girth), data=trees), + lm(log(trees$Volume) ~ log(trees$Girth)) + ) Unit: microseconds expr min lq median fastLmPure(cbind(1, log(trees$Girth)), log(trees$Volume)) 175.205 196.281 204.7115 fastLm(log(Volume) ~ log(Girth), data = trees) 2813.534 2864.849 2920.3795 lm(log(trees$Volume) ~ log(trees$Girth)) 3405.123 3476.415 3531.3955 uq max neval 221.205 2570.887 100 3010.730 5548.262 100 3657.118 6456.540 100