"""
This module contains the interfaces and common functionality used by the
Existence of Documentation Infrastructure check.
"""
from __future__ import annotations
import logging
import re
import string
from pathlib import Path
from typing import (
Any,
Dict,
Iterable,
List,
NamedTuple,
Optional,
Tuple,
Literal,
)
from urllib.parse import urlparse
import yaml
from git.repo import Repo
from gitlab.v4.objects import Project
from yaml.parser import ParserError
from yaml.scanner import ScannerError
from src.interfaces import Named
from src.utils import file_list
logger: logging.Logger = logging.getLogger(__name__)
class DocumentationTypeInterface(Named):
    """
    Abstracts over the different kinds of documentation that a project might
    have. The business logic for finding and scoring documentation is in the
    implementing classes, this interface is used by the main check class to
    compute the final score.

    The class also contains some helpers for common operations.
    """

    #: matches file names with a plain text extension (``.md``, ``.rst``,
    #: ``.txt``); used by `_text_file_filter`
    TEXT_FILE_REGEX: re.Pattern[str] = re.compile(r"\.(md|rst|txt)$")

    #: used to find markdown links to documentation
    LINK_PATTERN: re.Pattern[str] = re.compile(
        r'\[([^(\]\[]*?[dD]oc[su][^(\]\[]*?)\]\((http[^("]*?)\)', re.IGNORECASE
    )

    #: returned by methods that collect links to documentation
    PubbliccodeymlDocLink = NamedTuple(
        "PubbliccodeymlDocLink", [("type", str), ("url", str)]
    )
    ScrapedDocLink = NamedTuple(
        "ScrapedDocLink", [("file", str), ("preview", str), ("url", str)]
    )

    #: translation table for `str.translate` that deletes every character of
    #: `string.whitespace`
    RM_WHITESPACE_MAP: Dict[int, Literal[None]] = {
        ord(c): None for c in string.whitespace
    }

    def __init__(self, repo: Repo, api: Project) -> None:
        """
        :param repo: local clone of the project that is inspected
        :param api: GitLab API handle of the same project
        """
        self.repo: Repo = repo
        self.api: Project = api

    def _is_external_url(self, url: Optional[str]) -> bool:
        """
        Checks if a link points to a target outside of OpenCoDE.

        :param url: url to decide
        :return: True iff the url does not point to OpenCoDE; False if `url`
            is None
        """
        if url is None:
            return False
        # Host names are case-insensitive, but `netloc` keeps the spelling
        # used in the link, so normalise it before comparing.
        domain = urlparse(url).netloc.lower()
        return "opencode" not in domain

    def _link_in_scope(
        self, url: Optional[str], only_external: bool, only_internal: bool
    ) -> bool:
        """
        Shared scope filter of the link collecting methods.

        :param url: link target to decide
        :param only_external: accept only targets outside of OpenCoDE
        :param only_internal: accept only targets inside of OpenCoDE
        :return: True iff the link passes the requested filter; always True
            when neither flag is set
        """
        if only_external:
            return self._is_external_url(url)
        if only_internal:
            return not self._is_external_url(url)
        return True

    def _docs_in_publiccodeyml(
        self, only_external: bool = False, only_internal: bool = False
    ) -> List[DocumentationTypeInterface.PubbliccodeymlDocLink]:
        """
        Checks if the `publiccode.yaml` exists, and if it does, whether it
        contains links to documentation. Optionally returns only links that
        point back to the project itself, or only links that point to an URL
        outside of OpenCoDE.

        :param only_external: return only links that point to a location
            outside of OpenCoDE
        :param only_internal: return only links that point back to OpenCoDE
        :return: Tuples of (documentation type, link target)
            for all doc links that were found.
        :raises ValueError: if both `only_external` and `only_internal` are
            set
        """
        if only_external and only_internal:
            raise ValueError(
                "Specify either only_internal or only_external, not both"
            )

        ret: List[DocumentationTypeInterface.PubbliccodeymlDocLink] = []

        pcy: Optional[Dict[str, Any]] = self._get_publiccodeyml()
        # Also guards against implementations that hand back a non-mapping.
        if not isinstance(pcy, dict):
            return ret

        desc: Any = pcy.get("description")
        if not desc:
            return ret
        if not isinstance(desc, dict):
            # The file content is user supplied; a scalar or list here must
            # not crash the check.
            logger.info(
                f"publiccode.yml of {self.api.name_with_namespace} has invalid format: {desc=}"
            )
            return ret

        # (documentation type reported to the caller, key in publiccode.yml)
        doc_keys: Tuple[Tuple[str, str], ...] = (
            ("user", "documentation"),
            ("api", "apiDocumentation"),
        )
        for lang, lang_desc in desc.items():
            logger.debug(f"Project description in {lang} is {lang_desc}")
            if not isinstance(lang_desc, dict):
                logger.info(
                    f"publiccode.yml of {self.api.name_with_namespace} has invalid format: {lang_desc=}"
                )
                continue
            for doc_type, key in doc_keys:
                url: Any = lang_desc.get(key)
                if not url:
                    continue
                if not isinstance(url, str):
                    # Only strings can be link targets; anything else would
                    # break the url handling further down.
                    logger.info(
                        f"publiccode.yml of {self.api.name_with_namespace} has invalid format: {url=}"
                    )
                    continue
                logger.info(f"Found {doc_type} documentation: {url}")
                ret.append(self.PubbliccodeymlDocLink(doc_type, url))

        return [
            doc_link
            for doc_link in ret
            if self._link_in_scope(doc_link.url, only_external, only_internal)
        ]

    def _collect_doc_links(
        self, only_external: bool = False, only_internal: bool = False
    ) -> List[DocumentationTypeInterface.ScrapedDocLink]:
        """
        Scans some kinds of text files in the repository for links that have
        something like `*docs*` in the preview text. Optionally returns only
        links that point back to the project itself, or only links that point
        to an URL outside of OpenCoDE.

        :param only_external: return only links that point to a location
            outside of OpenCoDE
        :param only_internal: return only links that point back to the project
            itself
        :return: Tuples of (file name, link preview text, link target)
            for all doc links that were found.
        :raises ValueError: if both `only_external` and `only_internal` are
            set
        """
        if only_external and only_internal:
            raise ValueError(
                "Specify either only_internal or only_external, not both"
            )

        # Only used for the log message; constant for the whole scan.
        scope: str = (
            "external"
            if only_external
            else ("internal" if only_internal else "")
        )

        files: Iterable[Path] = file_list(
            self.repo, file_name_filter=self._text_file_filter
        )
        ret: List[DocumentationTypeInterface.ScrapedDocLink] = []
        for file in files:
            try:
                content: str = file.read_text()
            except UnicodeDecodeError as err:
                logger.error(f"Can not decode content of {file.name}: {err}")
                continue

            # With two capture groups `findall` yields (preview, target)
            # string tuples, not match objects.
            link_matches: List[Tuple[str, str]] = self.LINK_PATTERN.findall(
                content
            )
            for preview, target in link_matches:
                if not self._link_in_scope(
                    target, only_external, only_internal
                ):
                    continue
                triple: DocumentationTypeInterface.ScrapedDocLink = (
                    self.ScrapedDocLink(file.name, preview, target)
                )
                logger.info(f"Found link to {scope} documentation: {triple}")
                ret.append(triple)
        return ret

    def _amount(self, files: Iterable[Path]) -> int:
        """
        :param files: files to measure; files that can not be decoded are
            logged and skipped
        :return: Returns total number of non-whitespace characters in `files`.
        """
        ret: int = 0
        for file in files:
            try:
                ret += len(self._remove_whitespace(file.read_text()))
            except UnicodeDecodeError as err:
                logger.error(f"Can not decode content of {file.name}: {err}")
                continue
        return ret

    @classmethod
    def _text_file_filter(cls, file_name: str) -> bool:
        """
        Filter callback handed to `file_list`.

        :param file_name: name of the file to decide
        :return: True iff `file_name` does NOT end in one of the extensions
            matched by `TEXT_FILE_REGEX`
        """
        # NOTE(review): presumably `file_list` drops the files for which the
        # filter returns True -- confirm against `src.utils.file_list`.
        return not cls.TEXT_FILE_REGEX.search(file_name)

    def _get_publiccodeyml(self) -> Optional[Dict[str, Any]]:
        """
        Try to find and parse the projects publiccode.yaml.

        Both `publiccode.yml` and `publiccode.yaml` in the root of the working
        tree are tried; if both exist the content of `publiccode.yaml` is
        used.

        :return: a mapping that contains the parsed file, or None if no file
            was found, the file is empty, is not valid YAML or does not
            contain a mapping at the top level
        """
        root: Path = Path(str(self.repo.working_tree_dir))
        content: Optional[str] = None
        for file_name in ("publiccode.yml", "publiccode.yaml"):
            try:
                content = (root / file_name).read_text()
            except FileNotFoundError:
                logger.info(f"Project has no {file_name}")
            except UnicodeDecodeError as err:
                # Same best-effort handling as for the scanned text files.
                logger.error(f"Can not decode content of {file_name}: {err}")

        if not content:
            return None

        try:
            parsed: Any = yaml.safe_load(content)
        except yaml.YAMLError as err:
            # YAMLError is the common base of ParserError, ScannerError and
            # the other errors the loader can raise.
            logger.info(f"Project has invalid publiccode.yml: {err}")
            return None

        if not isinstance(parsed, dict):
            # Valid YAML, but a scalar/list (or an empty document) can not be
            # queried like a publiccode.yml.
            logger.info(
                "Project has invalid publiccode.yml: top level is not a mapping"
            )
            return None
        return parsed

    def _remove_whitespace(self, s: str) -> str:
        """
        :param s: string to strip
        :return: input string with all whitespace characters removed
        """
        return s.translate(self.RM_WHITESPACE_MAP)

    def delta(self) -> Tuple[float, int]:
        """
        Restriction of the `delta` map to the documentation type represented by
        the implementor and the repository specified during the construction
        of this instance.

        :return: confidence into the result, and amount of documentation
            detected
        :raises NotImplementedError: always; implementing classes have to
            override this method
        """
        raise NotImplementedError()