# Source code for src.checks.comments_in_code
"""Implementation of the "Comments in Code" check"""
from typing import Any, Dict, Optional, Set
from pathlib import Path
from collections import defaultdict
import logging
import shutil
import subprocess as sp
import json
from src.interfaces import CheckInterface
from src.exceptions import (
CheckConstructionException,
)
logger: logging.Logger = logging.getLogger(__name__)
class CommentsInCode(CheckInterface):
    """
    Implementation of the "Comments in Code" check

    This check essentially just runs `tokei` and divides comment lines by
    combined comment and code lines. There is some additional logic to handle
    programming languages.
    """

    def __init__(self, *args: Any, **kwargs: Dict[str, Any]) -> None:
        """
        Build the check: load the language mapping, query linguist, run tokei.

        :raises CheckConstructionException: if tokei is not on PATH, or the
            project contains no language supported by this check
        """
        super().__init__(*args, **kwargs)
        # Fail fast: verify tokei is on PATH *before* __run_tokei() tries to
        # execute it. Previously this check ran last, so a missing binary
        # surfaced as FileNotFoundError from sp.run() instead of the intended
        # CheckConstructionException.
        if not self.__have_tokei():
            raise CheckConstructionException(
                "Can not find tokei executable on PATH"
            )
        # A: tokei language name (lowercased) -> linguist language name or None
        self.tokei_to_linguist: Dict[
            str, Optional[str]
        ] = self.__load_tokei_to_linguist()
        # L_check = Im(A) \ {None}: the languages this check supports
        self.l_check: Set[str] = self.__compute_l_check(self.tokei_to_linguist)
        # linguist language (lowercased) -> its weight/share in the repository
        self.linguist: Dict[str, float] = self.__fetch_linguist()
        # linguist language -> {"code": ..., "comments": ...} aggregated counts
        self.tokei: Dict[str, Dict[str, int]] = self.__run_tokei()
        # I know this step is redundant but I do it anyways!
        self.l_repo: Set[str] = self.__compute_l_repo(
            # for L(r) we only use what tokei and linguist detected
            set(self.tokei.keys()).intersection(set(self.linguist.keys())),
            self.l_check,
        )
        if not self.l_repo:
            raise CheckConstructionException(
                "Project contains no language supported by this check"
            )

    def __load_tokei_to_linguist(self) -> Dict[str, Optional[str]]:
        """Load the tokei -> linguist language-name map from the resources."""
        # generate with
        # https://gitlab.opencode.de/OC000014832448/tokei_to_linguist
        with (self._get_resource_dir() / "tokei_to_linguist.json").open() as f:
            return json.load(f)

    def __compute_l_check(
        self, tokei_to_linguist: Dict[str, Optional[str]]
    ) -> Set[str]:
        """
        Compute L_check via relation L_check = Im(A) \\ {None}
        """
        return {str(x) for x in tokei_to_linguist.values() if x}

    def __have_tokei(self) -> bool:
        """Return True iff a `tokei` executable can be found on PATH."""
        return shutil.which("tokei") is not None

    def __fetch_linguist(self) -> Dict[str, float]:
        """Fetch per-language weights from the project, lowercasing the keys."""
        return {
            str(k).lower(): float(v) for k, v in self.proj.languages().items()
        }

    def __compute_l_repo(self, l_of_r: Set[str], l_check: Set[str]) -> Set[str]:
        """Compute L_repo as the intersection of L(r) and L_check."""
        return l_of_r.intersection(l_check)

    def __run_tokei(self) -> Dict[str, Dict[str, int]]:
        """
        Run `tokei` in the repository working directory and aggregate its
        per-language line counts under linguist language names.

        :return: mapping linguist language -> {"code": n, "comments": m}
        :raises subprocess.CalledProcessError: if tokei exits non-zero
        """
        raw: sp.CompletedProcess[bytes] = sp.run(
            ["tokei", "-o", "json"],
            capture_output=True,
            check=True,
            cwd=Path(str(self.repo.working_dir)),
        )
        tokei_results: Dict[str, Dict[str, int]] = defaultdict(
            lambda: {
                "code": 0,
                "comments": 0,
            }
        )
        for tokei_lang, stats in json.loads(raw.stdout).items():
            # restricts the keys of self.tokei to L_linguist, and thus also to
            # L_check (and since it comes from analyzing the repo, also L(r)),
            # i.e., we compute A(l) and skip None
            # grepme: change this to indexing once the bug in the script is
            # fixed
            mapped: Optional[str] = self.tokei_to_linguist.get(
                str(tokei_lang).lower()
            )
            if not mapped:
                continue
            # The map A is not injective, thus we might need to update an
            # existing entry, e.g., C headers and C source files both count
            # to C language
            tokei_results[mapped]["code"] += int(stats["code"])
            tokei_results[mapped]["comments"] += int(stats["comments"])
        return tokei_results

    def _tokei(self, lang: str) -> float:
        """
        Map that takes a language in L_repo to its comments to code ratio

        :param lang: a language in L_repo
        :return: comments / (code + comments), or 0.0 when no lines counted
        """
        ncomments: int = self.tokei[lang]["comments"]
        ncode: int = self.tokei[lang]["code"]
        total: int = ncode + ncomments
        # Guard the degenerate 0/0 case (language detected but zero countable
        # lines) instead of raising ZeroDivisionError.
        return ncomments / total if total else 0.0

    def _sigma(self, value: float) -> float:
        """
        Scoring function that receives the average comments to code ratio as an
        input and maps it to the final score.

        Needed since we cannot expect a project to be 100% comments to receive
        a perfect score
        """
        slope: int = 10
        # Linear ramp up to 1/slope (i.e. 10% comments), then a perfect score.
        return value * slope if value < 1 / slope else 1.0

    def _compute_tokei(self) -> Dict[str, float]:
        """
        :return: The computed `tokei` map (language -> comments-to-code ratio)
        """
        lang_ratios: Dict[str, float] = {}
        for lang in self.l_repo:
            ratio: float = self._tokei(lang)
            # lazy %-style args avoid formatting when INFO is disabled
            logger.info("C2C: %s - %s", lang, ratio)
            lang_ratios[lang] = ratio
        return lang_ratios

    def _compute_score(self, lang_ratios: Dict[str, float]) -> float:
        """
        Average the per-language ratios weighted by the linguist shares and
        pass the result through the scoring function sigma.
        """
        total_weight: float = sum(self.linguist[lang] for lang in self.l_repo)
        weighted_sum: float = sum(
            lang_ratios[lang] * self.linguist[lang] for lang in self.l_repo
        )
        return self._sigma(weighted_sum / total_weight)

    def run(self, args_dict: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """
        Execute the check.

        :param args_dict: optional arguments forwarded to the base class
        :return: dict with the final "score" and per-language "results",
            merged with the base class result (base values win on key clash)
        """
        ret: Dict[str, Any] = super().run(args_dict)
        lang_ratios: Dict[str, float] = self._compute_tokei()
        return {
            "score": self._compute_score(lang_ratios),
            "results": lang_ratios,
        } | ret