re-enable e2e UI tests on CI (#1961)
#1692 is still open. This PR is not an ideal approach, but it's a quick
win while we wait for that issue to be resolved.

By retrying failing tests up to 3 times, we _should_ be fine to
re-enable these on CI. If a test fails more than 3 times, there's likely a
legitimate issue occurring.
joeyorlando authored May 23, 2023
1 parent 06bd045 commit c793e55
Showing 9 changed files with 67 additions and 99 deletions.
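In Playwright terms, the retry approach described in the commit message is a one-line config switch. A minimal sketch of the mechanism this PR relies on (the real change to grafana-plugin/playwright.config.ts appears later in this diff):

```ts
import type { PlaywrightTestConfig } from '@playwright/test';

// retry failed tests up to 3 times on CI only; locally, fail immediately
const config: PlaywrightTestConfig = {
  retries: process.env.CI ? 3 : 0,
};

export default config;
```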
50 changes: 32 additions & 18 deletions .github/workflows/linting-and-tests.yml
@@ -262,15 +262,24 @@ jobs:
pytest -x
end-to-end-tests:
# TODO: reenable this job once https://github.com/grafana/oncall/issues/1692 is fixed
if: ${{ false }}
runs-on: ubuntu-latest
# default "ubuntu-latest" runners only provide 2 CPU cores + 7GB of RAM. this seems to lead to HTTP 504s from
# the oncall backend, and hence, flaky tests. Let's use CI runners w/ more resources to avoid this (plus
# this will allow us to run more backend containers and parallelize the tests)
runs-on: ubuntu-latest-8-cores
name: "End to end tests - Grafana: ${{ matrix.grafana-image-tag }}"
strategy:
matrix:
grafana-image-tag:
- 8.5.22
- 9.2.6
# OnCall doesn't work on the following versions of Grafana
# - 8.5.22
# - 9.0.0
# - 9.1.0

# 9.2.0 is the earliest version where things work
- 9.2.13
- 9.3.14
- 9.4.10
- 9.5.2
- main
- latest
fail-fast: false
@@ -331,10 +340,9 @@ jobs:
- name: Load engine Docker image on the nodes of the cluster
run: kind load image-archive --name=chart-testing /tmp/oncall-engine.tar

# spin up 2 engine, 2 celery, and 2 grafana pods, this will allow us to parallelize the integration tests
# spin up 3 engine, 3 celery, and 3 grafana pods, this will allow us to parallelize the integration tests,
# and complete them much faster by using multiple test processes
# With just 1 engine/celery/grafana pod, the backend crawls to a halt when there is > 1 parallelized integration
# test process
# With just 1 engine/celery/grafana pod, the backend crawls to a halt when there is > 1 parallelized integration test process
#
# by setting grafana.plugins to [] and configuring grafana.extraVolumeMounts we are using the locally built
# OnCall plugin rather than the latest published version
@@ -346,14 +354,14 @@
--values ./helm/simple.yml \
--values ./helm/values-local-image.yml \
--set-json 'env=[{"name":"GRAFANA_CLOUD_NOTIFICATIONS_ENABLED","value":"False"}]' \
--set engine.replicaCount=1 \
--set celery.replicaCount=1 \
--set engine.replicaCount=3 \
--set celery.replicaCount=3 \
--set celery.worker_beat_enabled="False" \
--set oncall.twilio.accountSid="${{ secrets.TWILIO_ACCOUNT_SID }}" \
--set oncall.twilio.authToken="${{ secrets.TWILIO_AUTH_TOKEN }}" \
--set oncall.twilio.phoneNumber="\"${{ secrets.TWILIO_PHONE_NUMBER }}"\" \
--set oncall.twilio.verifySid="${{ secrets.TWILIO_VERIFY_SID }}" \
--set grafana.replicas=1 \
--set grafana.replicas=3 \
--set grafana.image.tag=${{ matrix.grafana-image-tag }} \
--set grafana.env.GF_SECURITY_ADMIN_USER=oncall \
--set grafana.env.GF_SECURITY_ADMIN_PASSWORD=oncall \
@@ -378,12 +386,19 @@ jobs:
path: "~/.cache/ms-playwright"
key: ${{ runner.os }}-playwright-${{ env.PLAYWRIGHT_VERSION }}-chromium-firefox-webkit

- name: Install Playwright binaries/dependencies
# For the next two steps, use the binary directly from node_modules/.bin as opposed to npx playwright
# due to this bug (https://github.com/microsoft/playwright/issues/13188)
- name: Install Playwright Browsers
if: steps.playwright-cache.outputs.cache-hit != 'true'
# https://stackoverflow.com/questions/65900299/install-single-dependency-from-package-json-with-yarn
run: |
yarn add "@playwright/test@${{ env.PLAYWRIGHT_VERSION }}"
npx playwright install --with-deps chromium firefox webkit
working-directory: grafana-plugin
run: ./node_modules/.bin/playwright install --with-deps chromium firefox webkit

# use the cached browsers, but we still need to install the necessary system dependencies
# (system deps are installed in the cache-miss step above by the --with-deps flag)
- name: Install Playwright System Dependencies
if: steps.playwright-cache.outputs.cache-hit == 'true'
working-directory: grafana-plugin
run: ./node_modules/.bin/playwright install-deps chromium firefox webkit

- name: Await k8s pods and other resources up
uses: jupyterhub/action-k8s-await-workloads@v1
@@ -408,8 +423,7 @@ jobs:
GRAFANA_PASSWORD: oncall
MAILSLURP_API_KEY: ${{ secrets.MAILSLURP_API_KEY }}
working-directory: ./grafana-plugin
# -x = exit command after first failing test
run: yarn test:integration -x
run: yarn test:integration

# always spit out the engine and celery logs, AFTER the e2e tests have completed
# can be helpful for debugging failing/flaky tests
12 changes: 9 additions & 3 deletions grafana-plugin/integration-tests/globalSetup.ts
@@ -7,7 +7,7 @@ import { goToGrafanaPage } from './utils/navigation';
/**
* go to config page and wait for plugin icon to be available on left-hand navigation
*/
export const configureOnCallPlugin = async (page: Page): Promise<void> => {
const configureOnCallPlugin = async (page: Page): Promise<void> => {
// plugin configuration can safely be skipped for non open-source environments
if (!IS_OPEN_SOURCE) {
return;
@@ -31,8 +31,14 @@ export const configureOnCallPlugin = async (page: Page): Promise<void> => {
await clickButton({ page, buttonText: 'Connect' });
}

// wait for the "Connected to OnCall" message to know that everything is properly configured
await expect(page.getByTestId('status-message-block')).toHaveText(/Connected to OnCall.*/);
/**
* wait for the "Connected to OnCall" message to know that everything is properly configured
*
* Regarding increasing the timeout for the "plugin configured" assertion:
* This is because it can sometimes take a bit longer for the backend sync to finish. The default assertion
* timeout is 5s, which is sometimes not enough if the backend is under load
*/
await expect(page.getByTestId('status-message-block')).toHaveText(/Connected to OnCall.*/, { timeout: 25_000 });
};

/**
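Dropping the export keyword on configureOnCallPlugin is safe because a globalSetup file only needs to default-export a single setup function; the helper is consumed internally. A hypothetical sketch of that wiring, assuming the usual Playwright pattern (the actual OnCall setup file may differ):

```ts
import { chromium, FullConfig, Page } from '@playwright/test';

const configureOnCallPlugin = async (page: Page): Promise<void> => {
  // ... the plugin-configuration steps shown in the diff above ...
};

// Playwright invokes this default export once, before the entire suite runs
export default async function globalSetup(_config: FullConfig): Promise<void> {
  const browser = await chromium.launch();
  const page = await browser.newPage();
  await configureOnCallPlugin(page);
  await browser.close();
}
```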
19 changes: 15 additions & 4 deletions grafana-plugin/integration-tests/schedules/quality.test.ts
@@ -6,13 +6,24 @@ test('check schedule quality for simple 1-user schedule', async ({ page }) => {
const onCallScheduleName = generateRandomValue();
await createOnCallSchedule(page, onCallScheduleName);

await expect(page.locator('div[class*="ScheduleQuality"]')).toHaveText('Quality: Great');
/**
* this page.reload() call is a hack to temporarily get around this issue
* https://github.com/grafana/oncall/issues/1968
*/
await page.reload({ waitUntil: 'networkidle' });

await page.hover('div[class*="ScheduleQuality"]');
await expect(page.locator('div[class*="ScheduleQualityDetails"] >> span[class*="Text"] >> nth=2 ')).toHaveText(
const scheduleQualityElement = page.getByTestId('schedule-quality');

await expect(scheduleQualityElement).toHaveText('Quality: Great', { timeout: 15_000 });

await scheduleQualityElement.hover();

const scheduleQualityDetailsElement = page.getByTestId('schedule-quality-details');

await expect(scheduleQualityDetailsElement.locator('span[class*="Text"] >> nth=2 ')).toHaveText(
'Schedule has no gaps'
);
await expect(page.locator('div[class*="ScheduleQualityDetails"] >> span[class*="Text"] >> nth=3 ')).toHaveText(
await expect(scheduleQualityDetailsElement.locator('span[class*="Text"] >> nth=3 ')).toHaveText(
'Schedule is perfectly balanced'
);
});
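One note on the `>> nth=2` syntax retained above: it is the selector-engine spelling of picking the nth (0-based) match, and the .nth() locator method is an equivalent, arguably more readable form. A sketch reusing the test IDs from this test:

```ts
import { expect, test } from '@playwright/test';

test('nth() locator form (sketch)', async ({ page }) => {
  const details = page.getByTestId('schedule-quality-details');
  // equivalent to details.locator('span[class*="Text"] >> nth=2')
  await expect(details.locator('span[class*="Text"]').nth(2)).toHaveText('Schedule has no gaps');
});
```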
11 changes: 8 additions & 3 deletions grafana-plugin/playwright.config.ts
@@ -14,7 +14,7 @@ const config: PlaywrightTestConfig = {
testDir: './integration-tests',
globalSetup: './integration-tests/globalSetup.ts',
/* Maximum time one test can run for. */
timeout: 90 * 1000,
timeout: 60 * 1000,
expect: {
/**
* Maximum time expect() should wait for the condition to be met.
@@ -26,8 +26,8 @@
fullyParallel: true,
/* Fail the build on CI if you accidentally left test.only in the source code. */
forbidOnly: !!process.env.CI,
/* Retry on CI only */
retries: process.env.CI ? 3 : 0,
/**
* Retry on CI only
*
* NOTE: until we fix this issue (https://github.com/grafana/oncall/issues/1692) which occasionally leads
* to flaky tests... let's just retry failed tests. If the same test fails 3 times, you know something must be up
*/
retries: !!process.env.CI ? 3 : 0,
workers: 1,
/* Reporter to use. See https://playwright.dev/docs/test-reporters */
reporter: 'html',
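With retries enabled, an individual test can also detect which attempt it is on via the testInfo fixture, which helps when auditing which tests only pass on a retry. A small sketch (not part of this PR; the page URL is a placeholder):

```ts
import { test } from '@playwright/test';

test('example: log retry attempts', async ({ page }, testInfo) => {
  if (testInfo.retry > 0) {
    // testInfo.retry is 0 on the first attempt, 1..3 on CI retries
    console.warn(`running retry attempt ${testInfo.retry}`);
  }
  await page.goto('https://example.com'); // placeholder target
});
```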

This file was deleted.

This file was deleted.

@@ -38,7 +38,7 @@ const ScheduleQuality: FC<ScheduleQualityProps> = ({ schedule, lastUpdated }) =>

return (
<>
<div className={cx('root')}>
<div className={cx('root')} data-testid="schedule-quality">
{relatedEscalationChains?.length > 0 && schedule?.number_of_escalation_chains > 0 && (
<TooltipBadge
borderType="link"
@@ -29,7 +29,7 @@ export const ScheduleQualityDetails: FC<ScheduleQualityDetailsProps> = ({ qualit
const warningComments = comments.filter((c) => c.type === 'warning');

return (
<div className={cx('root')}>
<div className={cx('root')} data-testid="schedule-quality-details">
<div className={cx('container')}>
<div className={cx('container', 'container--withLateralPadding')}>
<Text type={cx('secondary', 'header')}>
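The two data-testid attributes added above are what the refactored quality test locates: by default, page.getByTestId('x') resolves to [data-testid="x"] (the attribute name is configurable via Playwright's testIdAttribute option). A sketch of the matching test-side usage:

```ts
import { expect, test } from '@playwright/test';

test('hover the schedule quality badge (sketch)', async ({ page }) => {
  // matches <div className={cx('root')} data-testid="schedule-quality">
  await page.getByTestId('schedule-quality').hover();

  // matches <div className={cx('root')} data-testid="schedule-quality-details">
  await expect(page.getByTestId('schedule-quality-details')).toBeVisible();
});
```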
3 changes: 1 addition & 2 deletions grafana-plugin/src/plugin.json
@@ -620,8 +620,7 @@
}
],
"dependencies": {
"grafanaDependency": ">=8.3.2",
"grafanaVersion": "8.3",
"grafanaDependency": ">=9.2.0",
"plugins": []
}
}
