Skip to content
This repository has been archived by the owner on Mar 23, 2019. It is now read-only.

Build gets stuck (non-deterministic behavior) #962

Open
jkovacevic opened this issue Aug 16, 2018 · 2 comments
Open

Build gets stuck (non-deterministic behavior) #962

jkovacevic opened this issue Aug 16, 2018 · 2 comments
Labels
needinfo stale-communication Candidate for the closure due to stale communication

Comments

@jkovacevic
Copy link

jkovacevic commented Aug 16, 2018

ISSUE TYPE
  • Bug Report
container.yml
version: "2"

# This is the docker-container config file. For more info check http://docs.ansible.com/ansible-container/
settings:
  conductor:
    # The Conductor container does the heavy lifting, and provides a portable
    # Python runtime for building your target containers. It should be derived
    # from the same distribution as you're building your target containers with.
    # NOTE(review): every service in this file is built from
    # store/oracle/serverjre:8 -- confirm that image shares a distribution with
    # ubuntu:xenial, as the advice above recommends.
    base: ubuntu:xenial
    # Sequence items are indented under their key, matching the `roles:` lists
    # used by the services in this file.
    roles_path:  # Specify a local path containing Ansible roles
      - ./roles/java_service
    # volumes:      # Provide a list of volumes to mount
    # environment:  # List or mapping of environment variables

  # Set the name of the project. Defaults to basename of the project directory.
  # For built services, concatenated with service name to form the built image name.
  project_name: smart-platform

  # The deployment_output_path is mounted to the Conductor container, and the
  # `run` and `deployment` commands then write generated Ansible playbooks to it.
  # deployment_output_path: ./ansible-deployment

services:
  # All three services share the same base image, role, command, entrypoint
  # and network mode; only the values under `vars` differ between them.
  # The AWS_* values are template expressions whose variables are not defined
  # in this file.

  # Smart Platform Gateway service.
  gateway:
    from: store/oracle/serverjre:8
    roles:
      - role: java_service
        vars:
          service: smart-platform-gateway-service
          service_name: 'Gateway'
          service_description: 'Smart Platform Gateway service'
        # Declared on the role entry (sibling of `vars`), not on the service.
        environment:
          AWS_SESSION_TOKEN: "{{ aws_session_token }}"
          AWS_ACCESS_KEY_ID: "{{ aws_access_key_id }}"
          AWS_SECRET_ACCESS_KEY: "{{ aws_secret_access_key }}"
    command:
      - /app/run_service.sh
    entrypoint:
      - /usr/bin/entrypoint.sh
    network_mode: 'host'

  # Supply Traffic Filtering service.
  stf:
    from: store/oracle/serverjre:8
    roles:
      - role: java_service
        vars:
          service: supply-traffic-filtering-service
          service_name: 'STF'
          service_description: 'Supply Traffic Filtering service'
        # Declared on the role entry (sibling of `vars`), not on the service.
        environment:
          AWS_SESSION_TOKEN: "{{ aws_session_token }}"
          AWS_ACCESS_KEY_ID: "{{ aws_access_key_id }}"
          AWS_SECRET_ACCESS_KEY: "{{ aws_secret_access_key }}"
    command:
      - /app/run_service.sh
    entrypoint:
      - /usr/bin/entrypoint.sh
    network_mode: 'host'

  # Demand Traffic Filtering service.
  dtf:
    from: store/oracle/serverjre:8
    roles:
      - role: java_service
        vars:
          service: demand-traffic-filtering-service
          service_name: 'DTF'
          service_description: 'Demand Traffic Filtering service'
        # Declared on the role entry (sibling of `vars`), not on the service.
        environment:
          AWS_SESSION_TOKEN: "{{ aws_session_token }}"
          AWS_ACCESS_KEY_ID: "{{ aws_access_key_id }}"
          AWS_SECRET_ACCESS_KEY: "{{ aws_secret_access_key }}"
    command:
      - /app/run_service.sh
    entrypoint:
      - /usr/bin/entrypoint.sh
    network_mode: 'host'
OS / ENVIRONMENT
Ansible Container, version 0.9.2
Linux, SM393, 4.15.0-32-generic, #35~16.04.1-Ubuntu SMP Fri Aug 10 21:54:34 UTC 2018, x86_64
3.5.2 (default, Nov 23 2017, 16:37:01) 
[GCC 5.4.0 20160609] /home/janko/IdeaProjects/smart-platform-infra/ansible/venv/bin/python3.5
{
  "DefaultRuntime": "runc",
  "Name": "SM393",
  "IndexServerAddress": "https://index.docker.io/v1/",
  "SystemStatus": null,
  "ClusterStore": "",
  "MemTotal": 25027665920,
  "NGoroutines": 34,
  "GenericResources": null,
  "ContainersPaused": 0,
  "ContainersRunning": 0,
  "CpuCfsPeriod": true,
  "IPv4Forwarding": true,
  "BridgeNfIp6tables": true,
  "ServerVersion": "18.03.1-ce",
  "Driver": "overlay2",
  "Containers": 0,
  "CPUShares": true,
  "OSType": "linux",
  "CPUSet": true,
  "NoProxy": "",
  "MemoryLimit": true,
  "InitBinary": "docker-init",
  "OperatingSystem": "Ubuntu 16.04.4 LTS",
  "SwapLimit": false,
  "KernelMemory": true,
  "SystemTime": "2018-08-16T16:56:52.967705832+02:00",
  "ExperimentalBuild": false,
  "HttpsProxy": "",
  "RegistryConfig": {
    "IndexConfigs": {
      "docker.io": {
        "Official": true,
        "Name": "docker.io",
        "Mirrors": [],
        "Secure": true
      }
    },
    "Mirrors": [],
    "AllowNondistributableArtifactsCIDRs": [],
    "AllowNondistributableArtifactsHostnames": [],
    "InsecureRegistryCIDRs": [
      "127.0.0.0/8"
    ]
  },
  "Plugins": {
    "Log": [
      "awslogs",
      "fluentd",
      "gcplogs",
      "gelf",
      "journald",
      "json-file",
      "logentries",
      "splunk",
      "syslog"
    ],
    "Network": [
      "bridge",
      "host",
      "macvlan",
      "null",
      "overlay"
    ],
    "Volume": [
      "local"
    ],
    "Authorization": null
  },
  "RuncCommit": {
    "ID": "4fc53a81fb7c994640722ac585fa9ca548971871",
    "Expected": "4fc53a81fb7c994640722ac585fa9ca548971871"
  },
  "ClusterAdvertise": "",
  "LoggingDriver": "json-file",
  "ID": "GU2L:7HDF:QXOM:P6TQ:BJTF:YJSP:TRNQ:CTWS:YV4K:SIY6:GVOI:UXBH",
  "NFd": 21,
  "NEventsListener": 0,
  "Swarm": {
    "ControlAvailable": false,
    "Error": "",
    "RemoteManagers": null,
    "NodeID": "",
    "LocalNodeState": "inactive",
    "NodeAddr": ""
  },
  "NCPU": 4,
  "LiveRestoreEnabled": false,
  "Architecture": "x86_64",
  "OomKillDisable": true,
  "InitCommit": {
    "ID": "949e6fa",
    "Expected": "949e6fa"
  },
  "Images": 2,
  "Debug": false,
  "KernelVersion": "4.15.0-32-generic",
  "BridgeNfIptables": true,
  "Labels": [],
  "DriverStatus": [
    [
      "Backing Filesystem",
      "extfs"
    ],
    [
      "Supports d_type",
      "true"
    ],
    [
      "Native Overlay Diff",
      "true"
    ]
  ],
  "ContainersStopped": 0,
  "ContainerdCommit": {
    "ID": "773c489c9c1b21a6d78b5c538cd395416ec50f88",
    "Expected": "773c489c9c1b21a6d78b5c538cd395416ec50f88"
  },
  "CpuCfsQuota": true,
  "Runtimes": {
    "runc": {
      "path": "docker-runc"
    }
  },
  "Isolation": "",
  "CgroupDriver": "cgroupfs",
  "SecurityOptions": [
    "name=apparmor",
    "name=seccomp,profile=default"
  ],
  "DockerRootDir": "/var/lib/docker",
  "HttpProxy": ""
}
{
  "MinAPIVersion": "1.12",
  "Version": "18.03.1-ce",
  "Os": "linux",
  "GoVersion": "go1.9.5",
  "Components": [
    {
      "Version": "18.03.1-ce",
      "Name": "Engine",
      "Details": {
        "BuildTime": "2018-04-26T07:15:30.000000000+00:00",
        "Experimental": "false",
        "MinAPIVersion": "1.12",
        "Os": "linux",
        "Arch": "amd64",
        "GoVersion": "go1.9.5",
        "GitCommit": "9ee9f40",
        "KernelVersion": "4.15.0-32-generic",
        "ApiVersion": "1.37"
      }
    }
  ],
  "GitCommit": "9ee9f40",
  "KernelVersion": "4.15.0-32-generic",
  "BuildTime": "2018-04-26T07:15:30.000000000+00:00",
  "Platform": {
    "Name": ""
  },
  "ApiVersion": "1.37",
  "Arch": "amd64"
}
SUMMARY

We are using an Ansible playbook to create Docker images for our services. For the conductor we use the image ansible/container-conductor-ubuntu-xenial:0.9.2 from Docker Hub. Our services are created from the store/oracle/serverjre image.

We experience non-deterministic behavior while running ansible-container build: the build sometimes gets stuck (this image shows the point where it gets stuck: https://imgur.com/gChyVXI).
The command being executed is given below:
ansible-container build --no-cache --services {{ component }}

To work around this problem, we have to stop the execution (i.e. press Ctrl+C) and run the build again. The build gets stuck approximately 40% of the time. The behavior remains non-deterministic even when we build different services.

After debugging, we found that the process gets stuck at this line of code:

command='sh -c "while true; do sleep 1; '
Therefore, we suspect that the problem is a race condition between threads.

STEPS TO REPRODUCE

Run: ansible-container build --no-cache --services {{ component }} from within an Ansible playbook, repeatedly, until it gets stuck.

EXPECTED RESULTS

The image should be built deterministically.

ACTUAL RESULTS

Image building sometimes gets stuck.

@jkovacevic
Copy link
Author

jkovacevic commented Aug 29, 2018

I am willing to provide additional information regarding this problem if needed.
There was one machine, not available at the moment, on which the build process did not get stuck. Unfortunately, the build gets stuck on all other machines (macOS, Ubuntu).

@Voronenko
Copy link
Contributor

@jkovacevic The best you can do is to isolate the issue into a reproducible state in its most minimal form. Push it to a repo and publish it on GitHub.

The screenshot gives me the impression that the root cause might actually be in the applied role logic itself, rather than in ansible-container.

But without a PoC repo with a reproducible hang, it will be impossible to say anything for sure.

@Voronenko Voronenko added the stale-communication Candidate for the closure due to stale communication label Sep 29, 2018
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
needinfo stale-communication Candidate for the closure due to stale communication
Projects
None yet
Development

No branches or pull requests

2 participants