Create a component to submit the Dataflow job to compute embeddings for code search.

* Update Beam to 2.8.0.
* Remove nmslib from the Apache Beam requirements.txt; it's not needed and
  appears to have problems installing on the Dataflow workers.

* The spacy download was failing on the Dataflow workers; reinstalling the
  spacy package as a pip package appears to fix this.

* Fix some bugs in the workflow for building the Docker images.
jlewi committed Nov 10, 2018
1 parent 7581888 commit 2669a54
Showing 11 changed files with 155 additions and 19 deletions.
18 changes: 9 additions & 9 deletions code_search/Makefile
@@ -30,7 +30,7 @@ build-cpu:
    @echo Built $(IMG):$(TAG)

# TODO(jlewi): We could always use build.jsonnet and then just
-# Parse out the docker build command.
+# Parse out the docker build command.
build-gpu:
    docker build -f "./docker/t2t/Dockerfile" \
        -t $(IMG)-gpu:$(TAG) \
@@ -42,41 +42,41 @@ build-gpu:
build-dataflow:
    docker build -f "./docker/t2t/Dockerfile.dataflow" \
        -t $(IMG)-dataflow:$(TAG) \
-       --label=git-versions=$(GIT_VERSION)
+       --label=git-versions=$(GIT_VERSION) \
        .
    @echo Built $(IMG)-dataflow:$(TAG)

build: build-cpu build-gpu build-dataflow

# Build using GCB. This is useful if we are on a slow internet connection
-# and don't want to pull
+# and don't want to pull
build-gcb:
    mkdir -p build
    jsonnet ./docker/t2t/build.jsonnet --ext-str gitVersion=$(GIT_VERSION) --ext-str tag=$(TAG) \
-       > ./build/build.json
+       > ./build/build.json
    cp -r ./docker ./build/
-   cp -r ./src ../build/
+   cp -r ./src ./build/
    rm -rf ./build/src/code_search/dataflow/cli/test_data
    rm -rf ./build/src/code_search/t2t/test_data
-   gcloud builds submit --project=kubeflow-ci --config=./build/build.json ./build
+   gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci --config=./build/build.json ./build


# Build but don't attach the latest tag. This allows manual testing/inspection of the image
# first.
push-cpu: build-cpu
-   gcloud docker --authorize-only
+   gcloud docker --authorize-only
    docker push $(IMG):$(TAG)
    @echo Pushed $(IMG):$(TAG)

push-gpu: build-gpu
-   gcloud docker --authorize-only
+   gcloud docker --authorize-only
    docker push $(IMG)-gpu:$(TAG)
    @echo Pushed $(IMG)-gpu:$(TAG)

push-trainer: push-cpu push-gpu

push-dataflow: build-dataflow
-   gcloud docker --authorize-only
+   gcloud docker --authorize-only
    docker push $(IMG)-dataflow:$(TAG)
    @echo Pushed $(IMG)-dataflow:$(TAG)
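The TODO at the top of this Makefile suggests generating the local docker build commands from build.jsonnet instead of duplicating them. A minimal Python sketch of that idea, assuming build/build.json has already been rendered by the build-gcb target; everything here is illustrative rather than part of the commit:

import json
import subprocess

# Load the Cloud Build config rendered from build.jsonnet by build-gcb.
with open("build/build.json") as f:
    build_config = json.load(f)

# Each docker-builder step carries the args of one `docker build` invocation,
# so the same config can drive local builds as well as GCB builds.
for step in build_config["steps"]:
    if step["name"].endswith("cloud-builders/docker"):
        subprocess.check_call(["docker"] + step["args"], cwd="build")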
27 changes: 20 additions & 7 deletions code_search/docker/t2t/build.jsonnet
@@ -1,21 +1,34 @@
// TODO(jlewi): We should tag the image latest and then
// use latest as a cache so that rebuilds are fast
// https://cloud.google.com/cloud-build/docs/speeding-up-builds#using_a_cached_docker_image
{

  "steps": [
    {
      "name": "gcr.io/cloud-builders/docker",
      "args": ["build", "-t", "gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"),
-             "--label=git-versions=" + std.extVar("gitVersion"),
-             "--build-arg", "BASE_IMAGE_TAG=1.11.0",
-             "./docker/t2t"],
+             "--label=git-versions=" + std.extVar("gitVersion"),
+             "--build-arg", "BASE_IMAGE_TAG=1.11.0",
+             "--file=docker/t2t/Dockerfile", "."],
      "waitFor": ["-"],
    },
    {
      "name": "gcr.io/cloud-builders/docker",
      "args": ["build", "-t", "gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag"),
-             "--label=git-versions=" + std.extVar("gitVersion"),
-             "--build-arg", "BASE_IMAGE_TAG=1.11.0-gpu",
-             "./docker/t2t"],
+             "--label=git-versions=" + std.extVar("gitVersion"),
+             "--build-arg", "BASE_IMAGE_TAG=1.11.0-gpu",
+             "--file=docker/t2t/Dockerfile", "."],
      "waitFor": ["-"],
    },
+   {
+     "name": "gcr.io/cloud-builders/docker",
+     "args": ["build", "-t", "gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag"),
+             "--label=git-versions=" + std.extVar("gitVersion"),
+             "--file=docker/t2t/Dockerfile.dataflow", "."],
+     "waitFor": ["-"],
+   },
  ],
  "images": ["gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"),
-            "gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag")],
+            "gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag"),
+            "gcr.io/kubeflow-examples/code-search-dataflow:" + std.extVar("tag")],
}
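For reference, the same config can be rendered without the jsonnet CLI. A small sketch using the jsonnet Python binding (the `jsonnet` pip package, an assumption; the Makefile itself shells out to the jsonnet binary), with example values for the two external variables:

import json
import _jsonnet  # pip install jsonnet

rendered = _jsonnet.evaluate_file(
    "docker/t2t/build.jsonnet",
    ext_vars={"gitVersion": "v20181109-dc79384", "tag": "v20181109"},  # example values
)
config = json.loads(rendered)

# "waitFor": ["-"] starts each step immediately, so the CPU, GPU, and
# Dataflow images build in parallel; everything listed under "images"
# is pushed when the build finishes.
for step in config["steps"]:
    print(step["args"][2])  # the image:tag passed to docker build -t
print(config["images"])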
Empty file added code_search/kubeflow/README.md
4 changes: 4 additions & 0 deletions code_search/kubeflow/components/experiments.libsonnet
@@ -7,5 +7,9 @@
    train_steps: 200000,
    eval_steps: 100,
    hparams_set: "transformer_base",
+   project: "code-search-demo",
+   modelDir: "gs://code-search-demo/models/20181107-dist-sync-gpu/export/1541712907/",
+   problem: "kf_github_function_docstring",
+   model: "kf_similarity_transformer",
  },
}
16 changes: 15 additions & 1 deletion code_search/kubeflow/components/params.libsonnet
@@ -20,7 +20,7 @@
    eval_steps: 10,
    image: "gcr.io/kubeflow-examples/code-search:" + imageTag,
    imageGpu: "gcr.io/kubeflow-examples/code-search-gpu:" + imageTag,
-   dataflowImage: "gcr.io/kubeflow-examples/code-search-dataflow:v20181106-v0.2-76-g611636c-dirty-860631",
+   dataflowImage: "gcr.io/kubeflow-examples/code-search-dataflow:v20181109-dc79384",

    imagePullSecrets: [],
    // TODO(jlewi): dataDir doesn't seem to be used.
@@ -106,6 +106,20 @@
      numWorkers: 5,
      project: "",
    },
+   "submit-code-embeddings-job": {
+     name: "submit-code-embeddings-job",
+     image: $.components["t2t-job"].dataflowImage,
+     // BigQuery table where results will be written.
+     targetDataset: "code_search",
+     workingDir: $.components["t2t-code-search"].workingDir,
+     dataDir: self.workingDir + "/data",
+     // Directory where the model is stored.
+     modelDir: "",
+     jobName: "submit-code-embeddings-job",
+     workerMachineType: "n1-highcpu-32",
+     numWorkers: 5,
+     project: "",
+   },

    tensorboard: {
      image: "tensorflow/tensorflow:1.8.0",
14 changes: 14 additions & 0 deletions code_search/kubeflow/components/submit-code-embeddings-job.jsonnet
@@ -0,0 +1,14 @@
+// Submit a Dataflow job to compute the code embeddings using a trained model.
+local k = import "k.libsonnet";
+
+local experiments = import "experiments.libsonnet";
+local lib = import "submit-code-embeddings-job.libsonnet";
+local env = std.extVar("__ksonnet/environments");
+local baseParams = std.extVar("__ksonnet/params").components["submit-code-embeddings-job"];
+local experimentName = baseParams.experiment;
+local params = baseParams + experiments[experimentName] + {
+  name: experimentName + "-embed-code",
+};
+
+
+std.prune(k.core.v1.list.new([lib.parts(params, env).job]))
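The `baseParams + experiments[experimentName] + {...}` expression is jsonnet object composition: later objects override earlier keys, so experiment settings win over component defaults and the explicit name wins over both. A rough Python analogy using values from experiments.libsonnet above, illustrative only:

# Later dicts win, mirroring jsonnet's left-to-right object merge.
base_params = {
    "name": "submit-code-embeddings-job",
    "modelDir": "",  # empty default from params.libsonnet
    "experiment": "demo-trainer-11-07-dist-sync-gpu",
}
experiment = {
    "modelDir": "gs://code-search-demo/models/20181107-dist-sync-gpu/export/1541712907/",
    "problem": "kf_github_function_docstring",
}
params = {**base_params, **experiment,
          "name": base_params["experiment"] + "-embed-code"}
print(params["name"])  # demo-trainer-11-07-dist-sync-gpu-embed-code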
73 changes: 73 additions & 0 deletions code_search/kubeflow/components/submit-code-embeddings-job.libsonnet
@@ -0,0 +1,73 @@
+{
+  parts(params, env):: {
+    // Submit a Dataflow job to compute the code embeddings using a trained model.
+    job:: {
+      apiVersion: "batch/v1",
+      kind: "Job",
+      metadata: {
+        name: params.name,
+        namespace: env.namespace,
+        labels: {
+          app: params.name,
+        },
+      },
+      spec: {
+        replicas: 1,
+        template: {
+          metadata: {
+            labels: {
+              app: params.name,
+            },
+          },
+          spec: {
+            // Don't restart because all the job should do is launch the Dataflow job.
+            restartPolicy: "Never",
+            containers: [
+              {
+                name: "dataflow",
+                image: params.image,
+                command: [
+                  "python2",
+                  "-m",
+                  "code_search.dataflow.cli.create_function_embeddings",
+                  "--runner=DataflowRunner",
+                  "--project=" + params.project,
+                  "--target_dataset=" + params.targetDataset,
+                  "--data_dir=" + params.dataDir,
+                  "--problem=" + params.problem,
+                  "--job_name=" + params.jobName,
+                  "--saved_model_dir=" + params.modelDir,
+                  "--temp_location=" + params.workingDir + "/dataflow/temp",
+                  "--staging_location=" + params.workingDir + "/dataflow/staging",
+                  "--worker_machine_type=" + params.workerMachineType,
+                  "--num_workers=" + params.numWorkers,
+                ],
+                env: [
+                  {
+                    name: "GOOGLE_APPLICATION_CREDENTIALS",
+                    value: "/secret/gcp-credentials/user-gcp-sa.json",
+                  },
+                ],
+                workingDir: "/src",
+                volumeMounts: [
+                  {
+                    mountPath: "/secret/gcp-credentials",
+                    name: "gcp-credentials",
+                  },
+                ],  // volumeMounts
+              },
+            ],  // containers
+            volumes: [
+              {
+                name: "gcp-credentials",
+                secret: {
+                  secretName: "user-gcp-sa",
+                },
+              },
+            ],
+          },  // spec
+        },
+      },
+    },  // job
+  },  // parts
+}
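The component above renders a plain batch/v1 Job whose single container launches the Dataflow job and exits. For comparison, a hedged sketch of an equivalent Job built with the official Kubernetes Python client; the namespace, image tag, and trimmed flag list stand in for the ksonnet params and are assumptions:

from kubernetes import client, config

config.load_kube_config()
container = client.V1Container(
    name="dataflow",
    image="gcr.io/kubeflow-examples/code-search-dataflow:v20181109-dc79384",
    command=["python2", "-m", "code_search.dataflow.cli.create_function_embeddings",
             "--runner=DataflowRunner"],  # remaining flags elided
    env=[client.V1EnvVar(name="GOOGLE_APPLICATION_CREDENTIALS",
                         value="/secret/gcp-credentials/user-gcp-sa.json")],
    volume_mounts=[client.V1VolumeMount(mount_path="/secret/gcp-credentials",
                                        name="gcp-credentials")],
)
pod_spec = client.V1PodSpec(
    restart_policy="Never",  # the pod only submits the Dataflow job
    containers=[container],
    volumes=[client.V1Volume(name="gcp-credentials",
                             secret=client.V1SecretVolumeSource(secret_name="user-gcp-sa"))],
)
job = client.V1Job(
    metadata=client.V1ObjectMeta(name="submit-code-embeddings-job", namespace="kubeflow"),
    spec=client.V1JobSpec(template=client.V1PodTemplateSpec(spec=pod_spec)),
)
client.BatchV1Api().create_namespaced_job(namespace="kubeflow", body=job)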
@@ -5,4 +5,5 @@
   workingDir: "gs://code-search-demo/20181104",
   dataDir: "gs://code-search-demo/20181104/data",
   project: "code-search-demo",
+  experiment: "demo-trainer-11-07-dist-sync-gpu",
 }
10 changes: 10 additions & 0 deletions code_search/src/code_search/dataflow/cli/create_function_embeddings.py
@@ -1,3 +1,6 @@
+"""Dataflow job to compute function embeddings."""
+import logging
+
 import apache_beam as beam

 import code_search.dataflow.cli.arguments as arguments
@@ -45,9 +48,16 @@ def create_function_embeddings(argv=None):
   )

   result = pipeline.run()
+  logging.info("Submitted Dataflow job: %s", result)
   if args.wait_until_finish:
     result.wait_until_finish()


 if __name__ == '__main__':
+  logging.basicConfig(level=logging.INFO,
+                      format=('%(levelname)s|%(asctime)s'
+                              '|%(pathname)s|%(lineno)d| %(message)s'),
+                      datefmt='%Y-%m-%dT%H:%M:%S',
+                      )
+  logging.getLogger().setLevel(logging.INFO)
   create_function_embeddings()
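The launch pattern added here is the standard Beam one: under DataflowRunner, pipeline.run() returns as soon as the job is submitted, and wait_until_finish() polls the remote job to a terminal state. A self-contained sketch of just that pattern; the Create transform is a placeholder, not the real embedding pipeline:

import argparse
import logging

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions


def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument("--wait_until_finish", action="store_true")
    args, pipeline_args = parser.parse_known_args(argv)

    pipeline = beam.Pipeline(options=PipelineOptions(pipeline_args))
    _ = pipeline | beam.Create(["placeholder"])  # real job: read functions, compute embeddings
    result = pipeline.run()
    logging.info("Submitted Dataflow job: %s", result)
    if args.wait_until_finish:
        result.wait_until_finish()  # block until the remote job completes


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    run()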
7 changes: 5 additions & 2 deletions code_search/src/requirements.txt
@@ -1,10 +1,13 @@
 astor~=0.7.0
-apache-beam[gcp]~=2.6.0
+apache-beam[gcp]~=2.8.0
 Flask~=1.0.0
 nltk~=3.3.0
-nmslib~=1.7.0
+# TODO(jlewi): nmslib builds are failing on Dataflow workers with Apache Beam 2.8.0.
+# We shouldn't need nmslib in the Dataflow jobs.
+# nmslib~=1.7.0
 oauth2client~=4.1.0
 requests~=2.19.0
 spacy~=2.0.0
 tensor2tensor~=1.9.0
 tensorflow~=1.11.0
+pybind11~=2.2.4
4 changes: 4 additions & 0 deletions code_search/src/setup.py
@@ -9,6 +9,10 @@
  install_requires = f.readlines()

CUSTOM_COMMANDS = [
+   # TODO(jlewi): python -m is complaining that the spacy module is not found
+   # even though it should be installed via requirements. Reinstalling it
+   # using a custom command appears to fix the problem.
+   ['pip', 'install', 'spacy'],
    ['python', '-m', 'spacy', 'download', 'en'],
    # TODO(sanyamkapoor): This isn't ideal but no other way for a seamless install right now.
    ['pip', 'install', 'https://github.com/kubeflow/batch-predict/tarball/master']
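For context on why a setup.py command fixes a worker-side failure: when a job ships a setup.py, Dataflow builds and installs the package on every worker, running any commands hooked into the build step before the job's code imports spacy. A sketch following the standard Beam custom-commands recipe (the well-known juliaset example pattern, not this repo's exact file; the package name is hypothetical):

import subprocess
from distutils.command.build import build as _build

import setuptools

CUSTOM_COMMANDS = [
    ["pip", "install", "spacy"],  # reinstall so `python -m spacy` resolves
    ["python", "-m", "spacy", "download", "en"],
]


class build(_build):
    # Run the custom commands as part of the normal build step on each worker.
    sub_commands = _build.sub_commands + [("CustomCommands", None)]


class CustomCommands(setuptools.Command):
    user_options = []

    def initialize_options(self):
        pass

    def finalize_options(self):
        pass

    def run(self):
        for command in CUSTOM_COMMANDS:
            subprocess.check_call(command)


setuptools.setup(
    name="code-search-dataflow-sketch",  # hypothetical package name
    version="0.1",
    packages=setuptools.find_packages(),
    cmdclass={"build": build, "CustomCommands": CustomCommands},
)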
