airbytehq · darynaishchenko · Feb 8, 2024 · Jan 19, 2024 · Jan 19, 2024 · Jan 19, 2024
@@ -1,5 +1,8 @@
 # Changelog
 
+## 3.4.0
+Add TestConnectorDocumentation suite for validating connectors documentation structure and content. 
+
 ## 3.3.3
 Аix `NoAdditionalPropertiesValidator` if no type found in `items` 
 

@@ -259,6 +259,11 @@ class ConnectorAttributesConfig(BaseConfig):
     )
 
 
+class TestConnectorDocumentationConfig(BaseConfig):
+    timeout_seconds: int = timeout_seconds
+    config_path: str = config_path
+
+
 class GenericTestConfig(GenericModel, Generic[TestConfigT]):
     bypass_reason: Optional[str]
     tests: Optional[List[TestConfigT]]
@@ -278,6 +283,7 @@ class AcceptanceTestConfigurations(BaseConfig):
     full_refresh: Optional[GenericTestConfig[FullRefreshConfig]]
     incremental: Optional[GenericTestConfig[IncrementalConfig]]
     connector_attributes: Optional[GenericTestConfig[ConnectorAttributesConfig]]
+    connector_documentation: Optional[GenericTestConfig[TestConnectorDocumentationConfig]]
 
 
 class Config(BaseConfig):

@@ -395,3 +395,16 @@ def pytest_sessionfinish(session, exitstatus):
 @pytest.fixture(name="connector_metadata")
 def connector_metadata_fixture(base_path) -> dict:
     return load_yaml_or_json_path(base_path / "metadata.yaml")
+
+
+@pytest.fixture(name="docs_path")
+def docs_path_fixture(base_path, connector_metadata) -> Path:
+    path_to_docs = connector_metadata["data"]["documentationUrl"].replace("https://docs.airbyte.com", "docs") + ".md"
+    airbyte_path = Path(base_path).parents[6]
+    return airbyte_path / path_to_docs
+
+
+@pytest.fixture(name="connector_documentation")
+def connector_documentation_fixture(docs_path: str) -> str:
+    with open(docs_path, "r") as f:
+        return f.read().rstrip()
@@ -2,8 +2,8 @@
 # Copyright (c) 2021 Airbyte, Inc., all rights reserved.
 #
 
-from .test_core import TestBasicRead, TestConnection, TestConnectorAttributes, TestDiscovery, TestSpec
+from .test_core import TestBasicRead, TestConnection, TestConnectorAttributes, TestDiscovery, TestSpec, TestConnectorDocumentation
 from .test_full_refresh import TestFullRefresh
 from .test_incremental import TestIncremental
 
-__all__ = ["TestSpec", "TestBasicRead", "TestConnection", "TestConnectorAttributes", "TestDiscovery", "TestFullRefresh", "TestIncremental"]
+__all__ = ["TestSpec", "TestBasicRead", "TestConnection", "TestConnectorAttributes", "TestDiscovery", "TestFullRefresh", "TestIncremental", "TestConnectorDocumentation"]
@@ -0,0 +1,5 @@
+
+1. [Log into your Airbyte Cloud](https://cloud.airbyte.com/workspaces) account.
+2. Click Sources and then click + New source/destination.
+3. On the Set up the source page, select {connector_name} from the Source type dropdown.
+4. Enter a name for the {connector_name} connector.
@@ -0,0 +1,2 @@
+
+1. Navigate to the Airbyte Open Source dashboard.
@@ -0,0 +1,6 @@
+
+<HideInUI>
+
+This page contains the setup guide and reference information for the [{connector_name}]({docs_link}) source connector.
+
+</HideInUI>
@@ -0,0 +1,2 @@
+
+The {connector_name} source connector supports the following [sync modes](https://docs.airbyte.com/cloud/core-concepts/#connection-sync-modes):
@@ -0,0 +1,2 @@
+
+Now that you have set up the {connector_name} source connector, check out the following {connector_name} tutorials:
@@ -10,12 +10,16 @@
 from functools import reduce
 from logging import Logger
 from os.path import splitext
+from pathlib import Path
+from threading import Thread
 from typing import Any, Dict, List, Mapping, MutableMapping, Optional, Set, Tuple
 from xmlrpc.client import Boolean
 
+import connector_acceptance_test.utils.docs as docs_utils
 import dpath.util
 import jsonschema
 import pytest
+import requests
 from airbyte_protocol.models import (
     AirbyteRecordMessage,
     AirbyteStream,
@@ -1335,3 +1339,192 @@ async def test_certified_connector_has_suggested_streams(
         assert (
             has_assigned_suggested_streams
         ), f"The `streams` empty list is not allowed for `metadata.data.suggestedStreams` for certified connectors."
+
+
+class TestConnectorDocumentation(BaseTest):
+    MANDATORY_FOR_TEST_STRICTNESS_LEVELS = []  # Used so that this is not part of the mandatory high strictness test suite yet
+
+    PREREQUISITES = "Prerequisites"
+    HEADING = "heading"
+    CREDENTIALS_KEYWORDS = ["account", "auth", "credentials", "access"]
+    CONNECTOR_SPECIFIC_HEADINGS = "<Connector-specific features>"
+
+    @pytest.fixture(name="operational_certification_test")
+    async def operational_certification_test_fixture(self, connector_metadata: dict) -> bool:
+        """
+        Fixture that is used to skip a test that is reserved only for connectors that are supposed to be tested
+        against operational certification criteria
+        """
+        if connector_metadata.get("data", {}).get("ab_internal", {}).get("ql") < 400:
+            pytest.skip("Skipping testing source connector documentation due to low ql.")
+        return True
+
+    def _get_template_headings(self, connector_name: str) -> tuple[tuple[str], tuple[str]]:
+        """
+        https://hackmd.io/Bz75cgATSbm7DjrAqgl4rw - standard template
+        Headings in order to docs structure.
+        """
+        all_headings = (
+            connector_name,
+            "Prerequisites",
+            "Setup guide",
+            f"Set up {connector_name}",
+            "For Airbyte Cloud:",
+            "For Airbyte Open Source:",
+            f"Set up the {connector_name} connector in Airbyte",
+            "For Airbyte Cloud:",
+            "For Airbyte Open Source:",
+            "Supported sync modes",
+            "Supported Streams",
+            self.CONNECTOR_SPECIFIC_HEADINGS,
+            "Performance considerations",
+            "Data type map",
+            "Troubleshooting",
+            "Tutorials",
+            "Changelog",
+        )
+        not_required_heading = (
+            f"Set up the {connector_name} connector in Airbyte",
+            "For Airbyte Cloud:",
+            "For Airbyte Open Source:",
+            self.CONNECTOR_SPECIFIC_HEADINGS,
+            "Performance considerations",
+            "Data type map",
+            "Troubleshooting",
+            "Tutorials",
+        )
+        return all_headings, not_required_heading
+
+    def _headings_description(self, connector_name: str) -> dict[str:Path]:
+        """
+        Headings with path to file with template description
+        """
+        descriptions_paths = {
+            connector_name: Path(__file__).parent / "doc_templates/source.txt",
+            "For Airbyte Cloud:": Path(__file__).parent / "doc_templates/for_airbyte_cloud.txt",
+            "For Airbyte Open Source:": Path(__file__).parent / "doc_templates/for_airbyte_open_source.txt",
+            "Supported sync modes": Path(__file__).parent / "doc_templates/supported_sync_modes.txt",
+            "Tutorials": Path(__file__).parent / "doc_templates/tutorials.txt",
+        }
+        return descriptions_paths
+
+    def test_prerequisites_content(
+        self, operational_certification_test, actual_connector_spec: ConnectorSpecification, connector_documentation: str, docs_path: str
+    ):
+        node = docs_utils.documentation_node(connector_documentation)
+        header_line_map = {docs_utils.header_name(n): n.map[1] for n in node if n.type == self.HEADING}
+        headings = tuple(header_line_map.keys())
+
+        if not header_line_map.get(self.PREREQUISITES):
+            pytest.fail(f"Documentation does not have {self.PREREQUISITES} section.")
+
+        prereq_start_line = header_line_map[self.PREREQUISITES]
+        prereq_end_line = docs_utils.description_end_line_index(self.PREREQUISITES, headings, header_line_map)
+
+        with open(docs_path, "r") as docs_file:
+            prereq_content_lines = docs_file.readlines()[prereq_start_line:prereq_end_line]
+            # adding real character to avoid accidentally joining lines into a wanted title.
+            prereq_content = "|".join(prereq_content_lines).lower()
+            required_titles, has_credentials = docs_utils.required_titles_from_spec(actual_connector_spec.connectionSpecification)
+
+            for title in required_titles:
+                assert title in prereq_content, (
+                    f"Required '{title}' field is not in {self.PREREQUISITES} section " f"or title in spec doesn't match name in the docs."
+                )
+
+            if has_credentials:
+                # credentials has specific check for keywords as we have a lot of way how to describe this step
+                credentials_validation = [k in prereq_content for k in self.CREDENTIALS_KEYWORDS]
+                assert True in credentials_validation, f"Required 'credentials' field is not in {self.PREREQUISITES} section."
+
+    def test_docs_structure(self, operational_certification_test, connector_documentation: str, connector_metadata: dict):
+        """
+        test_docs_structure gets all top-level headers from source documentation file and check that the order is correct.
+        The order of the headers should follow our standard template https://hackmd.io/Bz75cgATSbm7DjrAqgl4rw.
+        _get_template_headings returns tuple of headers as in standard template and non-required headers that might nor be in the source docs.
+        CONNECTOR_SPECIFIC_HEADINGS value in list of required headers that shows a place where should be a connector specific headers,
+        which can be skipped as out of standard template and depend of connector.
+        """
+
+        heading_names = docs_utils.prepare_headers(connector_documentation)
+        template_headings, non_required_heading = self._get_template_headings(connector_metadata["data"]["name"])
+
+        heading_names_len, template_headings_len = len(heading_names), len(template_headings)
+        heading_names_index, template_headings_index = 0, 0
+
+        while heading_names_index < heading_names_len and template_headings_index < template_headings_len:
+            heading_names_value = heading_names[heading_names_index]
+            template_headings_value = template_headings[template_headings_index]
+            # check that template header is specific for connector and actual header should not be validated
+            if template_headings_value == self.CONNECTOR_SPECIFIC_HEADINGS:
+                # check that actual header is not in required headers, as required headers should be on a right place and order
+                if heading_names_value not in template_headings:
+                    heading_names_index += 1  # go to the next actual header as CONNECTOR_SPECIFIC_HEADINGS can be more than one
+                    continue
+                else:
+                    # if actual header is required go to the next template header to validate actual header order
+                    template_headings_index += 1
+                    continue
+            # strict check that actual header equals template header
+            if heading_names_value == template_headings_value:
+                # found expected header, go to the next header in template and actual headers
+                heading_names_index += 1
+                template_headings_index += 1
+                continue
+            # actual header != template header means that template value is not required and can be skipped
+            if template_headings_value in non_required_heading:
+                # found non-required header, go to the next template header to validate actual header
+                template_headings_index += 1
+                continue
+            # any check is True, indexes didn't move to the next step
+            pytest.fail(docs_utils.reason_titles_not_match(heading_names_value, template_headings_value, template_headings))
+        # indexes didn't move to the last required one, so some headers are missed
+        if template_headings_index != template_headings_len:
+            pytest.fail(docs_utils.reason_missing_titles(template_headings_index, template_headings))
+
+    def test_docs_descriptions(
+        self, operational_certification_test, docs_path: str, connector_documentation: str, connector_metadata: dict
+    ):
+        connector_name = connector_metadata["data"]["name"]
+        template_descriptions = self._headings_description(connector_name)
+
+        node = docs_utils.documentation_node(connector_documentation)
+        header_line_map = {docs_utils.header_name(n): n.map[1] for n in node if n.type == self.HEADING}
+        actual_headings = tuple(header_line_map.keys())
+
+        for heading, description in template_descriptions.items():
+            if heading in actual_headings:
+
+                description_start_line = header_line_map[heading]
+                description_end_line = docs_utils.description_end_line_index(heading, actual_headings, header_line_map)
+
+                with open(docs_path, "r") as docs_file, open(description, "r") as template_file:
+
+                    docs_description_content = docs_file.readlines()[description_start_line:description_end_line]
+                    template_description_content = template_file.readlines()
+
+                    for d, t in zip(docs_description_content, template_description_content):
+                        d, t = docs_utils.prepare_lines_to_compare(connector_name, d, t)
+                        assert d == t, f"Description for '{heading}' does not follow structure.\nExpected: {t} Actual: {d}"
+
+    def test_validate_links(self, operational_certification_test, connector_documentation: str):
+        valid_status_codes = [200, 403, 401, 405]  # we skip 4xx due to needed access
+        links = re.findall("(https?://[^\s)]+)", connector_documentation)
+        invalid_links = []
+        threads = []
+
+        def validate_docs_links(docs_link):
+            response = requests.get(docs_link)
+            if response.status_code not in valid_status_codes:
+                invalid_links.append(docs_link)
+
+        for link in links:
+            process = Thread(target=validate_docs_links, args=[link])
+            process.start()
+            threads.append(process)
+
+        for process in threads:
+            process.join(timeout=30)  # 30s timeout for process else link will be skipped
+            process.is_alive()
+
+        assert not invalid_links, f"{len(invalid_links)} invalid links were found in the connector documentation: {invalid_links}."
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@

		1. Navigate to the Airbyte Open Source dashboard.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@

		The {connector_name} source connector supports the following [sync modes](https://docs.airbyte.com/cloud/core-concepts/#connection-sync-modes):
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@

		Now that you have set up the {connector_name} source connector, check out the following {connector_name} tutorials: