sample():RcppArmadilloを使ってみる

  • Cppの線形代数ライブラリArmadillo
  • sample()をcppにしてみよう。速くはならないけれど、cpp内での処理が可能になる
  • "csample_char.cpp","csample_num.cpp"という関数ファイルを作って、sourceRcpp()する(参考)
// [[Rcpp::depends(RcppArmadillo)]]

#include <RcppArmadilloExtensions/sample.h>

using namespace Rcpp ;

// [[Rcpp::export]]
CharacterVector csample_char( CharacterVector x, 
                              int size,
                              bool replace, 
                              NumericVector prob = NumericVector::create()
                              ) {
  RNGScope scope ;
  CharacterVector ret = RcppArmadillo::sample(x, size, replace, prob) ;
  return ret ;
}
#include <RcppArmadilloExtensions/sample.h>
// [[Rcpp::depends(RcppArmadillo)]]

using namespace Rcpp ;

// [[Rcpp::export]]
NumericVector csample_num( NumericVector x,
                           int size,
                           bool replace,
                           NumericVector prob = NumericVector::create()
                           ) {
  RNGScope scope;
  NumericVector ret = RcppArmadillo::sample(x, size, replace, prob);
  return ret;
}
sourceCpp("csample_char.cpp")
sourceCpp("csample_num.cpp")
  • 使い方
N <- 10
set.seed(7)

sample.r <- sample(letters, N, replace=T)

set.seed(7)
sample.c <- csample_char(letters, N, replace=T)

print(identical(sample.r, sample.c))
  • Rのsample()との比較
library(rbenchmark)
Loading required package: rbenchmark
set.seed(7)

## Definition of Sampling Frame
n.elem <- 1e2
frame1 <- rnorm(n.elem)
probs1 <- runif(n.elem)

## Definition of sampling regime
## Use replacement throughout
.replace <- TRUE
## Samplesize
n.samples1 <- 1e4

## Without probabilities
benchmark(r = sample(frame1, n.samples1, replace=.replace),        
          cpp = csample_num(frame1, n.samples1, replace=.replace),
          replications = 1e3,
          order = 'relative',
          columns = c("test", "replications", "relative", "elapsed")
          )

繰り返し距離:RcppArmadilloを使ってみる

  • こちらのマハラノビス距離の繰り返し計算でスピードアップの様子を見てみよう
  • cxxfunction()関数を使っている例をsourceCpp()用に書き換えてみる
    • データ型指定はRcpp namespace形式らしい
## Using RcppArmadillo
code <- 'arma::mat x = Rcpp::as<arma::mat>(X);
arma::mat mu = Rcpp::as<arma::mat>(Mu);
arma::mat sigma = Rcpp::as<arma::mat>(Sigma);
int n = x.n_rows;
arma::vec md(n);
for (int i=0; i<n; i++){
arma::mat x_i = x.row(i) - mu;
arma::mat Y = arma::solve( sigma, arma::trans(x_i) );
md(i) = arma::as_scalar(x_i * Y);
}
return wrap(md);'
mahalanobis_RcppArma <- cxxfunction( signature(X="numeric",
Mu="numeric", Sigma="numeric"), code, plugin="RcppArmadillo")
#include <RcppArmadilloExtensions/sample.h>
// [[Rcpp::depends(RcppArmadillo)]]

using namespace Rcpp ;

// [[Rcpp::export]]

NumericVector mahalanobis_RcppArma(NumericMatrix X,NumericMatrix Mu, NumericMatrix Sigma){
	arma::mat x = Rcpp::as<arma::mat>(X);
	arma::mat mu = Rcpp::as<arma::mat>(Mu);
	arma::mat sigma = Rcpp::as<arma::mat>(Sigma);
	int n = x.n_rows;
	arma::vec md(n);
	for (int i=0; i<n; i++){
		arma::mat x_i = x.row(i) - mu;
		arma::mat Y = arma::solve( sigma, arma::trans(x_i) );
		md(i) = arma::as_scalar(x_i * Y);
	}
	return wrap(md);
}
## Using R
mahalanobis_R <- function(X, Mu, Sigma){
md <- rep(NA, nrow(X) )
for(i in 1:nrow(X) )
md[i] <- mahalanobis(X[i,], center=Mu, cov=Sigma)
return(md)
}
X <- matrix(runif(10000),100,100)
Mu <- matrix(runif(100),1,100)
Sigma <- matrix(runif(10000),100,100)

benchmark(
mahalanobis_R(X, c(Mu), Sigma),
mahalanobis_RcppArma(X, Mu, Sigma),
columns=c("test", "replications","elapsed", "relative"),
replications=100,
order="relative")
                                test replications elapsed relative
2 mahalanobis_RcppArma(X, Mu, Sigma)          100   10.11    1.000
1     mahalanobis_R(X, c(Mu), Sigma)          100   49.02    4.849

fastLmPure():RcppArmadilloを使ってみる

  • 線形回帰
data(trees, package="datasets")

fastLmPure( cbind(1, log(trees$Girth)), log(trees$Volume) )
fastLm( log(Volume) ~ log(Girth), data=trees)
lm(log(trees$Volume) ~ log(trees$Girth))  
> fastLmPure( cbind(1, log(trees$Girth)), log(trees$Volume) )
$coefficients
          [,1]
[1,] -2.353325
[2,]  2.199970

$stderr
           [,1]
[1,] 0.23066284
[2,] 0.08983455

$df.residual
[1] 29

> fastLm( log(Volume) ~ log(Girth), data=trees)

Call:
fastLm.formula(formula = log(Volume) ~ log(Girth), data = trees)

Coefficients:
(Intercept)  log(Girth) 
    -2.3533      2.2000 
> lm(log(trees$Volume) ~ log(trees$Girth))  

Call:
lm(formula = log(trees$Volume) ~ log(trees$Girth))

Coefficients:
     (Intercept)  log(trees$Girth)  
          -2.353             2.200 
library(rbenchmark)
microbenchmark(
  fastLmPure( cbind(1, log(trees$Girth)), log(trees$Volume) ),
  fastLm( log(Volume) ~ log(Girth), data=trees),
  lm(log(trees$Volume) ~ log(trees$Girth))
)
> microbenchmark(
+   fastLmPure( cbind(1, log(trees$Girth)), log(trees$Volume) ),
+   fastLm( log(Volume) ~ log(Girth), data=trees),
+   lm(log(trees$Volume) ~ log(trees$Girth))
+ )
Unit: microseconds
                                                      expr      min       lq    median
 fastLmPure(cbind(1, log(trees$Girth)), log(trees$Volume))  175.205  196.281  204.7115
            fastLm(log(Volume) ~ log(Girth), data = trees) 2813.534 2864.849 2920.3795
                  lm(log(trees$Volume) ~ log(trees$Girth)) 3405.123 3476.415 3531.3955
       uq      max neval
  221.205 2570.887   100
 3010.730 5548.262   100
 3657.118 6456.540   100