# Source code for src.checks.comments_in_code
"""Implementation of the "Comments in Code" check"""
from typing import Any, Dict, Optional, Set
from pathlib import Path
from collections import defaultdict
import logging
import shutil
import subprocess as sp
import json
from src.interfaces import CheckInterface
from src.exceptions import (
CheckConstructionException,
)
logger: logging.Logger = logging.getLogger(__name__)
class CommentsInCode(CheckInterface):
    """
    Implementation of the "Comments in Code" check

    This check essentially just runs `tokei` and divides comment lines by
    combined comment and code lines. There is some additional logic to handle
    programming languages.
    """

    def __init__(self, *args: Any, **kwargs: Dict[str, Any]) -> None:
        """
        Build the check: load the language mapping, query linguist, run tokei.

        :raises CheckConstructionException: if tokei is not on PATH, or the
            project contains no language supported by this check
        """
        super().__init__(*args, **kwargs)
        # Fail fast: verify tokei is on PATH *before* __run_tokei() tries to
        # execute it. Previously this check ran last, so a missing binary
        # surfaced as FileNotFoundError from sp.run() instead of the intended
        # CheckConstructionException.
        if not self.__have_tokei():
            raise CheckConstructionException(
                "Can not find tokei executable on PATH"
            )
        # A: tokei language name (lowercased) -> linguist language name or None
        self.tokei_to_linguist: Dict[
            str, Optional[str]
        ] = self.__load_tokei_to_linguist()
        # L_check = Im(A) \ {None}: the languages this check supports
        self.l_check: Set[str] = self.__compute_l_check(self.tokei_to_linguist)
        # linguist language (lowercased) -> its weight/share in the repository
        self.linguist: Dict[str, float] = self.__fetch_linguist()
        # linguist language -> {"code": ..., "comments": ...} aggregated counts
        self.tokei: Dict[str, Dict[str, int]] = self.__run_tokei()
        # I know this step is redundant but I do it anyways!
        self.l_repo: Set[str] = self.__compute_l_repo(
            # for L(r) we only use what tokei and linguist detected
            set(self.tokei.keys()).intersection(set(self.linguist.keys())),
            self.l_check,
        )
        if not self.l_repo:
            raise CheckConstructionException(
                "Project contains no language supported by this check"
            )

    def __load_tokei_to_linguist(self) -> Dict[str, Optional[str]]:
        """Load the tokei -> linguist language-name map from the resources."""
        # generate with
        # https://gitlab.opencode.de/OC000014832448/tokei_to_linguist
        with (self._get_resource_dir() / "tokei_to_linguist.json").open() as f:
            return json.load(f)

    def __compute_l_check(
        self, tokei_to_linguist: Dict[str, Optional[str]]
    ) -> Set[str]:
        """
        Compute L_check via relation L_check = Im(A) \\ {None}
        """
        return {str(x) for x in tokei_to_linguist.values() if x}

    def __have_tokei(self) -> bool:
        """Return True iff a `tokei` executable can be found on PATH."""
        return shutil.which("tokei") is not None

    def __fetch_linguist(self) -> Dict[str, float]:
        """Fetch per-language weights from the project, lowercasing the keys."""
        return {
            str(k).lower(): float(v) for k, v in self.proj.languages().items()
        }

    def __compute_l_repo(self, l_of_r: Set[str], l_check: Set[str]) -> Set[str]:
        """Compute L_repo as the intersection of L(r) and L_check."""
        return l_of_r.intersection(l_check)

    def __run_tokei(self) -> Dict[str, Dict[str, int]]:
        """
        Run `tokei` in the repository working directory and aggregate its
        per-language line counts under linguist language names.

        :return: mapping linguist language -> {"code": n, "comments": m}
        :raises subprocess.CalledProcessError: if tokei exits non-zero
        """
        raw: sp.CompletedProcess[bytes] = sp.run(
            ["tokei", "-o", "json"],
            capture_output=True,
            check=True,
            cwd=Path(str(self.repo.working_dir)),
        )
        tokei_results: Dict[str, Dict[str, int]] = defaultdict(
            lambda: {
                "code": 0,
                "comments": 0,
            }
        )
        for tokei_lang, stats in json.loads(raw.stdout).items():
            # restricts the keys of self.tokei to L_linguist, and thus also to
            # L_check (and since it comes from analyzing the repo, also L(r)),
            # i.e., we compute A(l) and skip None
            # grepme: change this to indexing once the bug in the script is
            # fixed
            mapped: Optional[str] = self.tokei_to_linguist.get(
                str(tokei_lang).lower()
            )
            if not mapped:
                continue
            # The map A is not injective, thus we might need to update an
            # existing entry, e.g., C headers and C source files both count
            # to C language
            tokei_results[mapped]["code"] += int(stats["code"])
            tokei_results[mapped]["comments"] += int(stats["comments"])
        return tokei_results

    def _tokei(self, lang: str) -> float:
        """
        Map that takes a language in L_repo to its comments to code ratio

        :param lang: a language in L_repo
        :return: comments / (code + comments), or 0.0 when no lines counted
        """
        ncomments: int = self.tokei[lang]["comments"]
        ncode: int = self.tokei[lang]["code"]
        total: int = ncode + ncomments
        # Guard the degenerate 0/0 case (language detected but zero countable
        # lines) instead of raising ZeroDivisionError.
        return ncomments / total if total else 0.0

    def _sigma(self, value: float) -> float:
        """
        Scoring function that receives the average comments to code ratio as an
        input and maps it to the final score.

        Needed since we cannot expect a project to be 100% comments to receive
        a perfect score
        """
        slope: int = 10
        # Linear ramp up to 1/slope (i.e. 10% comments), then a perfect score.
        return value * slope if value < 1 / slope else 1.0

    def _compute_tokei(self) -> Dict[str, float]:
        """
        :return: The computed `tokei` map (language -> comments-to-code ratio)
        """
        lang_ratios: Dict[str, float] = {}
        for lang in self.l_repo:
            ratio: float = self._tokei(lang)
            # lazy %-style args avoid formatting when INFO is disabled
            logger.info("C2C: %s - %s", lang, ratio)
            lang_ratios[lang] = ratio
        return lang_ratios

    def _compute_score(self, lang_ratios: Dict[str, float]) -> float:
        """
        Average the per-language ratios weighted by the linguist shares and
        pass the result through the scoring function sigma.
        """
        total_weight: float = sum(self.linguist[lang] for lang in self.l_repo)
        weighted_sum: float = sum(
            lang_ratios[lang] * self.linguist[lang] for lang in self.l_repo
        )
        return self._sigma(weighted_sum / total_weight)

    def run(self, args_dict: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """
        Execute the check.

        :param args_dict: optional arguments forwarded to the base class
        :return: dict with the final "score" and per-language "results",
            merged with the base class result (base values win on key clash)
        """
        ret: Dict[str, Any] = super().run(args_dict)
        lang_ratios: Dict[str, float] = self._compute_tokei()
        return {
            "score": self._compute_score(lang_ratios),
            "results": lang_ratios,
        } | ret