# Source code for src.checks.sast_usage_basic
"""Implementation of the SastUsageBasic check"""
from __future__ import annotations
import csv
import json
import logging
import re
from collections import defaultdict
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Set
import jsonschema
from src.config import context
from src.interfaces import CheckInterface
from src.exceptions import CheckConstructionException
logger: logging.Logger = logging.getLogger(__name__)
class SastToolKind(Enum):
    """
    Enumerates the different classes of SAST tools that we differentiate between
    in our check. Each SastTool has one and it determines "how good" it is if
    we detect it in a project.
    """

    LINTER = 1
    SECURITY = 2
    SECRET = 3
    SCA = 4

    @classmethod
    def weight(cls, kind: SastToolKind) -> float:
        """
        Encodes "how good" it is if we detect the tool in a project.

        :param kind: The tool class to rate
        :return: The detection weight of that tool class
        """
        # Dedicated security scanners count full; linters, secret scanners
        # and SCA tools count half.
        return {
            cls.LINTER: 0.5,
            cls.SECURITY: 1,
            cls.SECRET: 0.5,
            cls.SCA: 0.5,
        }[kind]
# Check
class SastTool:
    """
    Represents a tool that we can hope to detect in a project.

    The tool is described by a JSON-derived dict (validated against the tool
    JSON schema); its fields can be read by indexing the instance.
    """

    # Magic values for keys that store regular expressions. They are
    # automatically transformed to fixed regexes. Here are the ones that are not
    # parametrized by the tool. The others are constructed in the init function.
    #
    # note: empty fields are equivalent to $matchnothing
    default_special_regex_values: Dict[str, re.Pattern[str]] = {
        # NOTE(review): r"\\." matches a literal backslash followed by any
        # character, which essentially never occurs in POSIX paths. Possibly
        # r"\." was intended (as used by $PCH_path_re / $CF_path_re below);
        # confirm before changing.
        "$PDF_path_re": re.compile(r"\\."),
        "$PDF_name_re": re.compile(r"^\.gitlab-ci\.yml$"),
        "$PCH_path_re": re.compile(r"\."),
        "$PCH_name_re": re.compile(r"precommit"),
        "$CF_path_re": re.compile(r"\."),
        "$readme": re.compile(r"(README|[Rr]eadme)"),
        "$matchall": re.compile(r""),
        # (?!x)x can never match; used to disable a detection strategy.
        "$matchnothing": re.compile(r"(?!x)x"),
        # NOTE(review): same literal-backslash concern as $PDF_path_re above.
        "$rootdir": re.compile(r"\\."),
    }

    # Default language source file regex values: maps programming languages to
    # a regular expression that should recognize source files of that language
    # by name (usually file extension). Combined to form the source_file_regex
    # of a tool and then compiled. If there is no regex for a language it falls
    # back to a match-nothing regex.
    default_language_regex_values: Dict[str, str] = defaultdict(
        lambda: r"(?!x)x",
        {
            "python": r"^.*?\.py$",
            "dockerfile": r"^Dockerfile$",
            "shell": r"^.*?\.sh$",
            "rust": r"^.*?\.rs$",
            "typescript": r"^.*?\.ts$",
            "tsx": r"^.*?\.tsx$",
            # bugfix: was r"^.*?\.js[x]$" -- the one-element character class
            # [x] requires a literal 'x', so plain ".js" files were never
            # recognized. "x?" makes the trailing x optional (.js and .jsx).
            "javascript": r"^.*?\.jsx?$",
            "fluent": r"^.*?\.ftl$",
        },
    )

    def __init__(self, tool_json: Dict[str, Any]):
        """
        :param tool_json: A map describing the tool; can be accessed by indexing the instance; domain is defined by the tool JSON schema
        """
        self.name: str = tool_json["name"]
        self.kind: SastToolKind = SastToolKind[tool_json["kind"]]
        # Tool-parametrized magic values: the pipeline and pre-commit content
        # regexes simply search for the tool's name.
        self.special_regex_values: Dict[
            str, re.Pattern[str]
        ] = self.default_special_regex_values | {
            "$PDF_data_re": re.compile(f"{tool_json['name']}"),
            "$PCH_data_re": re.compile(f"{tool_json['name']}"),
        }
        tool_json = self.__compile_regex(tool_json)
        tool_json = self.__add_source_file_regex(tool_json)
        self.tool_json: Dict[str, Any] = tool_json

    def __getitem__(self, key: str) -> Any:
        """
        Read access to a field of the underlying tool definition.

        bugfix: this method was missing even though instances are indexed
        throughout the check (e.g. self["SLD"] in check_file and
        tool["languages"] in SastUsageBasic), which raised TypeError.

        :param key: Field name as defined by the tool JSON schema
        :return: The (possibly regex-compiled) field value
        """
        return self.tool_json[key]

    def __add_source_file_regex(
        self, tool_json: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Adds a `source_file_regex` key to the input map and populates the value
        with a regex that should match the names of source files of languages
        the input maps' `languages` array.

        :return: The updated map
        """
        # Alternation over the per-language default regexes; unknown
        # languages contribute the match-nothing fallback.
        tool_json["source_file_regex"] = re.compile(
            "("
            + "|".join(
                [
                    self.default_language_regex_values[language]
                    for language in tool_json["languages"]
                ]
            )
            + ")"
        )
        # consistency fix: was logging.info (root logger); use the module logger
        logger.info(
            f"{self.name} is using {tool_json['source_file_regex']} to find source files"
        )
        return tool_json

    def __compile_regex(self, tool_json: Dict[str, Any]) -> Dict[str, Any]:
        """
        Replaces string values in the input dict that represent regular
        expressions with their compiled versions.

        :return: The updated dict
        """
        # These keys hold plain data, not regular expressions.
        non_regex_keys: Set[str] = {
            "name",
            "description",
            "url",
            "languages",
            "stars",
            "applicable",
        }
        special_regex_value_prefix: str = "$"
        for k, v in tool_json.items():
            if k in non_regex_keys:
                continue
            if v.startswith(special_regex_value_prefix):
                # replace fields with magic values with their pre-compiled
                # regexes
                tool_json[k] = self.special_regex_values[v]
            elif v == "":
                # use the match-nothing regex for empty fields
                tool_json[k] = self.special_regex_values["$matchnothing"]
            else:
                # a bare "." means "match a literal dot", not "match anything"
                if v == ".":
                    v = "\\."
                tool_json[k] = re.compile(v)
        return tool_json

    @classmethod
    def from_file_validate(cls, schema: Dict[str, Any], file: Path) -> SastTool:
        """
        Constructs an instance from a JSON file describing a tool. Validates the
        file against the expected schema before using it.

        :param schema: The tool JSON schema to validate against
        :param file: Path of the JSON tool definition
        :raises jsonschema.ValidationError: if the file does not match the schema
        """
        with file.open(mode="r") as f:
            tool_json: Dict[str, Any] = json.load(f)
        jsonschema.validate(tool_json, schema)
        return cls(tool_json)

    # pylint: disable-next=too-complex,too-many-return-statements
    def check_file(self, f: Path) -> bool:
        """
        :return: True iff the file 'f' indicates that the SAST tool is being used in the project
        """
        logger.debug(f"Check {self['name']} on {f.name}")
        if not f.is_file():
            return False
        logstring = f"Detected {self.name} in {f} via strategy "
        # If the file is a source file of a supported language, see if we can
        # find tool-specific source code or comment artifacts.
        if self._check_file(f, self["source_file_regex"], self["SLD"]):
            logger.info(logstring + "SLD")
            return True
        # If the file looks like a tool-specific configuration file report the
        # tool as being present. Optionally check that the content indicates it
        # as well.
        if self._check_file(f, self["TCF_name"], self["TCF_data"]):
            logger.info(logstring + "TCF")
            return True
        # If the file looks like a pipeline/CI/CD/... definition file, check
        # the content to see if it looks like they run the tool.
        if self._check_file(f, self["PDF_name"], self["PDF_data"]):
            logger.info(logstring + "PDF")
            return True
        # If the file looks like a pre-commit hook definition file, check
        # the content to see if it looks like they run the tool.
        if self._check_file(f, self["PCH_name"], self["PCH_data"]):
            logger.info(logstring + "PCH")
            return True
        # If the file looks like a language-tooling configuration file, check
        # the content to see if it looks like they configure the tool in it.
        if self._check_file(f, self["CF_name"], self["CF_data"]):
            logger.info(logstring + "CF")
            return True
        # If the file looks like a Readme, check if they proudly present the
        # tool's badge in it.
        if self._check_file(f, self["BDG_name"], self["BDG_data"]):
            # consistency fix: log the winning strategy like all other branches
            logger.info(logstring + "BDG")
            return True
        return False

    def _check_file(
        self,
        f: Path,
        name_regex: Optional[re.Pattern[str]],
        content_regex: Optional[re.Pattern[str]] = None,
    ) -> bool:
        """
        :return: True iff a file's name and contents match the respective regular expressions; if no content regex is supplied only the filename is checked
        """
        if not name_regex or not re.search(name_regex, f.name):
            return False
        if not content_regex:
            logger.info(f"Filename matches {name_regex} and there is no content_regex")
            return True
        # robustness: repositories contain binary / non-UTF-8 files; replace
        # undecodable bytes instead of raising UnicodeDecodeError mid-scan.
        with f.open(mode="r", errors="replace") as lines:
            for line in lines:
                if not re.search(content_regex, line):
                    continue
                logger.info(f"Filename matches {name_regex} and <{line}> matches {content_regex}")
                return True
        return False

    @property
    def weight(self) -> float:
        """
        Not all tools are equally good. Here we supply a rather arbitrary weight
        to influence how much effect the presence of the tool has on the final
        score. The higher the weight, the more I like the tool.

        :return: The weight
        """
        return SastToolKind.weight(self.kind)
class SastUsageBasic(CheckInterface):
    """
    Check that detects which SAST tools a project uses per programming
    language and derives a score from the detections.
    """

    # Paths matching this pattern are excluded from the scanned file list.
    exclude: re.Pattern[str] = re.compile("(^.git$|test)")

    def __init__(self, *args: Any, **kwargs: Dict[str, Any]) -> None:
        """
        :raises CheckConstructionException: if the project reports no languages
        """
        super().__init__(*args, **kwargs)
        self.tool_schema: Dict[str, Any] = self.__load_tool_schema()
        self.__generate_tools()
        self.tools: List[SastTool] = self.__load_tools()
        self.lang_tools: Dict[str, List[SastTool]] = self.__build_lang_tools()
        if not self.proj.languages():
            # typo fix: was "conatins"
            raise CheckConstructionException("Project contains no languages?!")

    def __load_tool_schema(self) -> Dict[str, Any]:
        """
        Loads the JSON schema of the tool definitions from permanent storage.

        config: tool_schema

        :return: JSON schema of a single tool
        """
        schema_path: Path = context.settings[f"{self.name}_tool_schema"]
        assert schema_path.is_file()
        logger.info(
            f"Loading tool definition schema for {self.name} from {schema_path}"
        )
        with schema_path.open(mode="r") as f:
            return json.load(f)

    def __generate_tools(self) -> None:
        """
        Generates the individual JSON tool definitions from a CSV file that
        describes all of them.

        config: tools_csv
        effect: populates the directory config::tools_dir
        note: no-op if directory is not empty
        """
        tools_dir: Path = context.settings[f"{self.name}_tools_dir"]
        # Restored cache guard (was commented out with a "grepme: Always
        # generate for testing" debug note): the documented contract above is
        # that generation is a no-op when the directory is already populated.
        if len(list(tools_dir.iterdir())) != 0:
            return
        tool_defs: Path = Path(context.settings[f"{self.name}_tools_csv"])
        with tool_defs.open(mode="r", encoding="UTF-8") as f:
            reader = csv.reader(f, delimiter="\t")
            header: List[str] = next(reader)
            for row in reader:
                tool: Dict[str, Any] = dict(zip(header, row))
                # bugfix: csv.reader yields strings, so the original
                # comparison `tool["applicable"] == 0` was always False and
                # non-applicable tools were generated anyway.
                if str(tool["applicable"]) == "0":
                    continue
                # languages are stored space-separated in the CSV
                tool["languages"] = str(tool["languages"]).split(" ")
                jsonschema.validate(tool, self.tool_schema)
                tool_json: str = json.dumps(tool)
                logger.info(
                    f"""Generated tool: {(tools_dir / f"{tool['name']}.json").as_posix()}"""
                )
                (tools_dir / f"{tool['name']}.json").write_text(tool_json)

    def __load_tools(self) -> List[SastTool]:
        """
        Loads the JSON tool definitions from permanent storage.

        config: tools_dir

        :return: list of tools
        """
        tools_dir: Path = context.settings[f"{self.name}_tools_dir"]
        assert tools_dir.is_dir()
        tools: List[SastTool] = [
            SastTool.from_file_validate(self.tool_schema, file)
            for file in tools_dir.iterdir()
        ]
        logger.info(f"Loaded tools {[t['name'] for t in tools]}")
        return tools

    def __build_lang_tools(self) -> Dict[str, List[SastTool]]:
        """
        Constructs a mapping from programming languages to tools

        :return: The mapping
        """
        mapping: Dict[str, List[SastTool]] = defaultdict(list)
        for tool in self.tools:
            for language in tool["languages"]:
                mapping[language].append(tool)
        # consistency fix: was logging.info (root logger); use the module logger
        logger.info(
            f"Built mapping: { {lang: [t['name'] for t in tools] for lang, tools in mapping.items()} }"
        )
        return mapping

    def _detect_sast_tools(
        self,
    ) -> Dict[str, List[SastTool]]:
        """
        Performs the actual "analysis". Builds map that takes programming
        languages to the set of SAST tools that the project uses for this
        language.

        :return: The mapping
        """
        detected_tools: Dict[str, List[SastTool]] = defaultdict(list)
        # hoisted loop invariant: the lowered project-language set does not
        # change between iterations
        project_languages: Set[str] = {
            s.lower() for s in dict(self.proj.languages())
        }
        for lang, tools in self.lang_tools.items():
            if lang not in project_languages:
                continue
            for f in self._gen_file_list():
                # every candidate tool for this language was already found
                if not tools:
                    break
                # bugfix: iterate over a snapshot -- `tools` is mutated below,
                # and removing from a list while iterating it skips elements.
                for tool in list(tools):
                    if tool.check_file(f):
                        # typo fix: was "lanugage"
                        logger.info(f"Removing tool {tool.name} for language {lang}")
                        detected_tools[lang].append(tool)
                        # stop re-checking a tool once it was detected
                        tools.remove(tool)
        logger.info(
            "Detected SAST tools "
            f"{[(lang, [tool.name for tool in tools]) for lang, tools in detected_tools.items()]}"
            f" for project {self.proj.id}."
        )
        return detected_tools

    def _calc_score(self, detected_tools: Dict[str, List[SastTool]]) -> float:
        """
        Consumes the result of`_detect_sast_tools` and calculates the final
        score out of it. Each language contributes its share (percent / 100)
        times the weight of the best tool detected for it.

        :return: score
        """
        score: float = 0.0
        for lang, lweight in dict(self.proj.languages()).items():
            lang: str = str(lang)
            # languages() reports percentages; normalize to a 0..1 fraction
            lweight: float = float(lweight) / 100
            # consistency fix: was logging.info (root logger)
            logger.info(f"Language {lang} has weight {lweight}")
            tweight: float = 0.0
            # only the single best tool per language counts
            for tool in detected_tools[lang.lower()]:
                tweight = max(tweight, tool.weight)
            logger.info(f"Maximum tool weight is {tweight}")
            score += float(lweight) * tweight
        return score

    def run(self, args_dict: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """
        Executes the check.

        :return: score and per-language detected tool names, merged with the
            base class' result dict
        """
        ret: Dict[str, Any] = super().run(args_dict)
        detected_tools: Dict[str, List[SastTool]] = self._detect_sast_tools()
        results: Dict[str, Any] = {
            "lang_tools": [
                [lang, [tool.name for tool in tools]]
                for lang, tools in detected_tools.items()
            ],
        }
        assert self.results_valid(results)
        return {
            "score": self._calc_score(detected_tools),
            "results": results,
        } | ret