This repository has been archived by the owner on Oct 12, 2023. It is now read-only.

Commit

Fix/export (#21)
* Added autoscale formula

* Fixing resource files example
brnleehng authored May 25, 2017
1 parent a4b3826 commit 8c35a36
Showing 3 changed files with 33 additions and 17 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
@@ -6,6 +6,7 @@ export(getJobList)
export(getJobResult)
export(makeCluster)
export(registerDoAzureParallel)
export(resizeCluster)
export(setChunkSize)
export(setCredentials)
export(setVerbose)
1 change: 1 addition & 0 deletions R/autoscale.R
@@ -64,6 +64,7 @@ getAutoscaleFormula <- function(formulaName, dedicatedMin, dedicatedMax, lowPrio
#'
#' @examples
#' resizeCluster(cluster, dedicatedMin = 2, dedicatedMax = 6, dedicatedMin = 2, dedicatedMax = 6, algorithm = "QUEUE", timeInterval = "PT10M")
#' @export
resizeCluster <- function(cluster,
                          dedicatedMin,
                          dedicatedMax,
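Editorial aside, not part of this diff: the @examples call above passes dedicatedMin and dedicatedMax twice, which looks like a typo; judging from getAutoscaleFormula()'s signature in the hunk header, the second pair was presumably meant to be the low-priority bounds. A minimal usage sketch under that assumption:

library(doAzureParallel)
cluster <- makeCluster("cluster_settings.json")
# assumption: resizeCluster() takes lowPriorityMin/lowPriorityMax, mirroring getAutoscaleFormula()
resizeCluster(cluster,
              dedicatedMin = 2, dedicatedMax = 6,
              lowPriorityMin = 2, lowPriorityMax = 6,
              algorithm = "QUEUE", timeInterval = "PT10M")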
48 changes: 31 additions & 17 deletions samples/resource_files_example.R
@@ -16,27 +16,42 @@ generateCredentialsConfig("credentials.json")
# set azure credentials
setCredentials("credentials.json")

# generate a cluster config file
generateClusterConfig("cluster_settings.json")

# Add the data.table package to the CRAN packages and Azure/rAzureBatch to the GitHub packages
# so that the packages are installed on all of the nodes
# Since reading the large datasets requires a lot of memory, we recommend using Standard_D11_v2
# "rPackages": {
# "cran": ["data.table"],
# "github": ["Azure/rAzureBatch", "Azure/doAzureParallel"]
# }
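Editorial aside, not part of this diff: in the generated cluster_settings.json the VM size recommendation and the package list end up as sibling fields, roughly like the fragment below; the exact schema depends on the doAzureParallel version, so treat "vmSize" here as an assumption.
# "vmSize": "Standard_D11_v2",
# "rPackages": {
#   "cran": ["data.table"],
#   "github": ["Azure/rAzureBatch", "Azure/doAzureParallel"]
# }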

# ===================================================
# === Setting up your cluster with resource files ===
# ===================================================

# Now we will use resource-files to upload our dataset onto each node of our cluster.
# Currently, our data is stored in Azure Blob in an account called 'playdatastore',
# in a public container called "nyc-taxi-dataset". To get this dataset onto each node,
# we will create a resourceFile object for each blob - we will then use the resourceFile
# when building the cluster so that each node in the cluster knows to download these files
# after the node is provisioned.
# Using the NYC taxi datasets, http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml
azureStorageUrl <- "http://playdatastore.blob.core.windows.net/nyc-taxi-dataset"
resource_files <- list(
- createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-01.csv"), fileName = "yellow_tripdata_2016-01.csv"),
- createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-02.csv"), fileName = "yellow_tripdata_2016-02.csv"),
- createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-03.csv"), fileName = "yellow_tripdata_2016-03.csv"),
- createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-04.csv"), fileName = "yellow_tripdata_2016-04.csv"),
- createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-05.csv"), fileName = "yellow_tripdata_2016-05.csv"),
- createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-06.csv"), fileName = "yellow_tripdata_2016-06.csv"),
- createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-07.csv"), fileName = "yellow_tripdata_2016-07.csv"),
- createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-08.csv"), fileName = "yellow_tripdata_2016-08.csv")
+ createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-1.csv"), fileName = "yellow_tripdata_2016-1.csv"),
+ createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-2.csv"), fileName = "yellow_tripdata_2016-2.csv"),
+ createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-3.csv"), fileName = "yellow_tripdata_2016-3.csv"),
+ createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-4.csv"), fileName = "yellow_tripdata_2016-4.csv"),
+ createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-5.csv"), fileName = "yellow_tripdata_2016-5.csv"),
+ createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-6.csv"), fileName = "yellow_tripdata_2016-6.csv"),
+ createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-7.csv"), fileName = "yellow_tripdata_2016-7.csv"),
+ createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-8.csv"), fileName = "yellow_tripdata_2016-8.csv"),
+ createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-9.csv"), fileName = "yellow_tripdata_2016-9.csv"),
+ createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-10.csv"), fileName = "yellow_tripdata_2016-10.csv"),
+ createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-11.csv"), fileName = "yellow_tripdata_2016-11.csv"),
+ createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-12.csv"), fileName = "yellow_tripdata_2016-12.csv")
)
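Editorial aside, not part of this diff: the twelve near-identical calls above could also be built in a loop; a minimal sketch using only base R and the createResourceFile() helper already used in this sample:

resource_files <- lapply(1:12, function(month) {
  fileName <- sprintf("yellow_tripdata_2016-%d.csv", month)  # matches the un-padded names above
  createResourceFile(url = paste0(azureStorageUrl, "/", fileName), fileName = fileName)
})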

# add the parameter 'resourceFiles' to download files to nodes
@@ -45,20 +60,19 @@ cluster <- makeCluster("cluster_settings.json", resourceFiles = resource_files)
# when the cluster is provisioned, register the cluster as your parallel backend
registerDoAzureParallel(cluster)
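Editorial aside, not part of this diff: once registerDoAzureParallel(cluster) has run, foreach's own introspection helpers can confirm which backend is active and how many workers a %dopar% loop will use:

library(foreach)
getDoParName()     # should report the registered doAzureParallel backend
getDoParWorkers()  # number of workers available to %dopar%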


# ======================================================
# === Setting up storage account to write results to ===
# ======================================================

# Set up the storage location to write your results to.
# This step lets you upload your results from within your doAzureParallel foreach loop:
#
# 1. Replace "mystorageaccount" with the name of the storage account you wish to write your results to.
# 2. Create an output container named "nyc-taxi-graphs" to store your results in.
# 3. Create a SasToken that allows us to write ("w") to the container.
# 4. Note the parameter 'path = "c"' in the createSasToken function; this
#    simply means that the token is created for that entire container in storage.
#
storageAccountName <- "mystorageaccount"
outputsContainer <- "nyc-taxi-graphs"
createContainer(outputsContainer)
@@ -68,8 +82,8 @@ outputSas <- createSasToken(permission = "w", path = "c", outputsContainer)
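Editorial aside, not part of this diff: the write-enabled SAS token is meant to be used inside the foreach loop (hidden by the diff truncation below) to push results into the container. A rough sketch, assuming rAzureBatch exposes an uploadBlob() helper; the function and argument names here are assumptions, so check the installed rAzureBatch version for the exact signature:

# hypothetical call -- verify uploadBlob() and its arguments against your rAzureBatch version
uploadBlob(containerName = outputsContainer,
           fileDirectory = paste0(getwd(), "/taxi-plot-1.png"),
           sasToken = outputSas,
           accountName = storageAccountName)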
# === Foreach with resourceFiles & writing to storage ===
# =======================================================

- results <- foreach(i = 1:8, .packages = c("data.table", "ggplot2", "rAzureBatch")) %dopar% {
+ results <- foreach(i = 1:12, .packages = c("data.table", "ggplot2", "rAzureBatch")) %dopar% {

# To get access to your Azure resource files, you need to use the special
# environment variable to get the directory
fileDirectory <- paste0(Sys.getenv("AZ_BATCH_NODE_STARTUP_DIR"), "/wd")
@@ -78,7 +92,7 @@ results <- foreach(i = 1:8, .packages = c("data.table", "ggplot2", "rAzureBatch"
colsToKeep <- c("pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude", "tip_amount", "trip_distance")

# read in data from CSV that was downloaded from the resource file
- file <- fread(paste0(fileDirectory, "/yellow_tripdata_2016-0", i, ".csv"), select = colsToKeep)
+ file <- fread(paste0(fileDirectory, "/yellow_tripdata_2016-", i, ".csv"), select = colsToKeep)

# set the coordinates for the bounds of the plot
min_lat <- 40.5774
