This repository has been archived by the owner on Oct 12, 2023. It is now read-only.

Commit

Fix/export (#21)
* Added autoscale formula

* Fixing resource files example
brnleehng authored May 25, 2017
1 parent a4b3826 commit 8c35a36
Showing 3 changed files with 33 additions and 17 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
@@ -6,6 +6,7 @@ export(getJobList)
export(getJobResult)
export(makeCluster)
export(registerDoAzureParallel)
export(resizeCluster)
export(setChunkSize)
export(setCredentials)
export(setVerbose)
1 change: 1 addition & 0 deletions R/autoscale.R
@@ -64,6 +64,7 @@ getAutoscaleFormula <- function(formulaName, dedicatedMin, dedicatedMax, lowPrio
#'
#' @examples
#' resizeCluster(cluster, dedicatedMin = 2, dedicatedMax = 6, dedicatedMin = 2, dedicatedMax = 6, algorithm = "QUEUE", timeInterval = "PT10M")
#' @export
resizeCluster <- function(cluster,
                          dedicatedMin,
                          dedicatedMax,
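Editorial aside, not part of this diff: the @examples call above passes dedicatedMin and dedicatedMax twice, which looks like a typo; judging from getAutoscaleFormula()'s signature in the hunk header, the second pair was presumably meant to be the low-priority bounds. A minimal usage sketch under that assumption:

library(doAzureParallel)
cluster <- makeCluster("cluster_settings.json")
# assumption: resizeCluster() takes lowPriorityMin/lowPriorityMax, mirroring getAutoscaleFormula()
resizeCluster(cluster,
              dedicatedMin = 2, dedicatedMax = 6,
              lowPriorityMin = 2, lowPriorityMax = 6,
              algorithm = "QUEUE", timeInterval = "PT10M")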
48 changes: 31 additions & 17 deletions samples/resource_files_example.R
@@ -16,27 +16,42 @@ generateCredentialsConfig("credentials.json")
# set azure credentials
setCredentials("credentials.json")

# generate a cluster config file
generateClusterConfig("cluster_settings.json")

# Add the data.table package to the CRAN packages and Azure/rAzureBatch to the GitHub packages
# so that the packages are installed on all of the nodes
# Since reading the large datasets requires a lot of memory, we recommend using Standard_D11_v2
# "rPackages": {
# "cran": ["data.table"],
# "github": ["Azure/rAzureBatch", "Azure/doAzureParallel"]
# }
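Editorial aside, not part of this diff: in the generated cluster_settings.json the VM size recommendation and the package list end up as sibling fields, roughly like the fragment below; the exact schema depends on the doAzureParallel version, so treat "vmSize" here as an assumption.
# "vmSize": "Standard_D11_v2",
# "rPackages": {
#   "cran": ["data.table"],
#   "github": ["Azure/rAzureBatch", "Azure/doAzureParallel"]
# }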

# ===================================================
# === Setting up your cluster with resource files ===
# ===================================================

# Now we will use resource-files to upload our dataset onto each node of our cluster.
# Currently, our data is stored in Azure Blob in an account called 'playdatastore',
# in a public container called "nyc-taxi-dataset". To get this dataset onto each node,
# we will create a resourceFile object for each blob - we will then use the resourceFile
# when building the cluster so that each node in the cluster knows to download these files
# after the node is provisioned.
# Using the NYC taxi datasets, http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml
azureStorageUrl <- "http://playdatastore.blob.core.windows.net/nyc-taxi-dataset"
resource_files <- list(
- createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-01.csv"), fileName = "yellow_tripdata_2016-01.csv"),
- createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-02.csv"), fileName = "yellow_tripdata_2016-02.csv"),
- createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-03.csv"), fileName = "yellow_tripdata_2016-03.csv"),
- createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-04.csv"), fileName = "yellow_tripdata_2016-04.csv"),
- createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-05.csv"), fileName = "yellow_tripdata_2016-05.csv"),
- createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-06.csv"), fileName = "yellow_tripdata_2016-06.csv"),
- createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-07.csv"), fileName = "yellow_tripdata_2016-07.csv"),
- createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-08.csv"), fileName = "yellow_tripdata_2016-08.csv")
+ createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-1.csv"), fileName = "yellow_tripdata_2016-1.csv"),
+ createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-2.csv"), fileName = "yellow_tripdata_2016-2.csv"),
+ createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-3.csv"), fileName = "yellow_tripdata_2016-3.csv"),
+ createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-4.csv"), fileName = "yellow_tripdata_2016-4.csv"),
+ createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-5.csv"), fileName = "yellow_tripdata_2016-5.csv"),
+ createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-6.csv"), fileName = "yellow_tripdata_2016-6.csv"),
+ createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-7.csv"), fileName = "yellow_tripdata_2016-7.csv"),
+ createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-8.csv"), fileName = "yellow_tripdata_2016-8.csv"),
+ createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-9.csv"), fileName = "yellow_tripdata_2016-9.csv"),
+ createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-10.csv"), fileName = "yellow_tripdata_2016-10.csv"),
+ createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-11.csv"), fileName = "yellow_tripdata_2016-11.csv"),
+ createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-12.csv"), fileName = "yellow_tripdata_2016-12.csv")
)
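Editorial aside, not part of this diff: the twelve near-identical calls above could also be built in a loop; a minimal sketch using only base R and the createResourceFile() helper already used in this sample:

resource_files <- lapply(1:12, function(month) {
  fileName <- sprintf("yellow_tripdata_2016-%d.csv", month)  # matches the un-padded names above
  createResourceFile(url = paste0(azureStorageUrl, "/", fileName), fileName = fileName)
})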

# add the parameter 'resourceFiles' to download files to nodes
@@ -45,20 +60,19 @@ cluster <- makeCluster("cluster_settings.json", resourceFiles = resource_files)
# when the cluster is provisioned, register the cluster as your parallel backend
registerDoAzureParallel(cluster)
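Editorial aside, not part of this diff: once registerDoAzureParallel(cluster) has run, foreach's own introspection helpers can confirm which backend is active and how many workers a %dopar% loop will use:

library(foreach)
getDoParName()     # should report the registered doAzureParallel backend
getDoParWorkers()  # number of workers available to %dopar%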


# ======================================================
# === Setting up storage account to write results to ===
# ======================================================

# Set up the storage location to write your results to.
# This step lets you upload your results from within your doAzureParallel foreach loop:
#
# 1. Replace "mystorageaccount" with the name of the storage account you wish to write your results to.
# 2. Create an output container named "nyc-taxi-graphs" to store your results in.
# 3. Create a SasToken that allows us to write ("w") to the container.
# 4. Note the parameter 'path = "c"' in the createSasToken function; this
#    simply means that the token is created for that entire container in storage.
#
storageAccountName <- "mystorageaccount"
outputsContainer <- "nyc-taxi-graphs"
createContainer(outputsContainer)
@@ -68,8 +82,8 @@ outputSas <- createSasToken(permission = "w", path = "c", outputsContainer)
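Editorial aside, not part of this diff: the write-enabled SAS token is meant to be used inside the foreach loop (hidden by the diff truncation below) to push results into the container. A rough sketch, assuming rAzureBatch exposes an uploadBlob() helper; the function and argument names here are assumptions, so check the installed rAzureBatch version for the exact signature:

# hypothetical call -- verify uploadBlob() and its arguments against your rAzureBatch version
uploadBlob(containerName = outputsContainer,
           fileDirectory = paste0(getwd(), "/taxi-plot-1.png"),
           sasToken = outputSas,
           accountName = storageAccountName)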
# === Foreach with resourceFiles & writing to storage ===
# =======================================================

- results <- foreach(i = 1:8, .packages = c("data.table", "ggplot2", "rAzureBatch")) %dopar% {
+ results <- foreach(i = 1:12, .packages = c("data.table", "ggplot2", "rAzureBatch")) %dopar% {

# To get access to your Azure resource files, you need to use the special
# environment variable to get the directory
fileDirectory <- paste0(Sys.getenv("AZ_BATCH_NODE_STARTUP_DIR"), "/wd")
@@ -78,7 +92,7 @@ results <- foreach(i = 1:8, .packages = c("data.table", "ggplot2", "rAzureBatch"
colsToKeep <- c("pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude", "tip_amount", "trip_distance")

# read in data from CSV that was downloaded from the resource file
- file <- fread(paste0(fileDirectory, "/yellow_tripdata_2016-0", i, ".csv"), select = colsToKeep)
+ file <- fread(paste0(fileDirectory, "/yellow_tripdata_2016-", i, ".csv"), select = colsToKeep)

# set the coordinates for the bounds of the plot
min_lat <- 40.5774
