Source code for src.checks.sast_usage_basic

"""Implementation of the SastUsageBasic check"""

from __future__ import annotations

import csv
import json
import logging
import re
from collections import defaultdict
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Set

import jsonschema

from src.config import context
from src.interfaces import CheckInterface
from src.exceptions import CheckConstructionException

logger: logging.Logger = logging.getLogger(__name__)


class SastToolKind(Enum):
    """
    Enumerates the different classes of SAST tools that we differentiate
    between in our check. Each SastTool has one and it determines "how good"
    it is if we detect it in a project.
    """

    LINTER = 1
    SECURITY = 2
    SECRET = 3
    SCA = 4

    @classmethod
    def weight(cls, kind: SastToolKind) -> float:
        """
        Encodes "how good" it is if we detect the tool in a project.
        """
        # Dispatch table instead of a match statement; .get mirrors the
        # original fall-through (no matching case -> None).
        kind_weights = {
            cls.LINTER: 0.5,
            cls.SECURITY: 1,
            cls.SECRET: 0.5,
            cls.SCA: 0.5,
        }
        return kind_weights.get(kind)
# Tool model
class SastTool:
    """
    Represents a tool that we can hope to detect in a project.

    Wraps the tool's JSON definition (see ``from_file_validate``); individual
    definition fields can be read by indexing the instance.
    """

    # Magic values for keys that store regular expressions. They are
    # automatically transformed to fixed regexes. Here are the ones that are not
    # parametrized by the tool. The others are constructed in the init function.
    #
    # note: empty fields are equivalent to $matchnothing
    #
    # NOTE(review): r"\\." matches a literal backslash followed by any
    # character, which never matches a POSIX-style path. Possibly r"\." (a
    # literal dot, as used for "$CF_path_re") was intended for "$PDF_path_re"
    # and "$rootdir" — confirm against the tool definitions before changing.
    default_special_regex_values: Dict[str, re.Pattern[str]] = {
        "$PDF_path_re": re.compile(r"\\."),
        "$PDF_name_re": re.compile(r"^\.gitlab-ci\.yml$"),
        "$PCH_path_re": re.compile(r"\."),
        "$PCH_name_re": re.compile(r"precommit"),
        "$CF_path_re": re.compile(r"\."),
        "$readme": re.compile(r"(README|[Rr]eadme)"),
        "$matchall": re.compile(r""),
        "$matchnothing": re.compile(r"(?!x)x"),
        "$rootdir": re.compile(r"\\."),
    }

    # Default language source file regex values: maps programming languages to
    # a regular expression that should recognize source files of that language
    # by name (usually file extension). Combined to form the source_file_regex
    # of a tool and then compiled. If there is no regex for a language it falls
    # back to a match-nothing regex.
    default_language_regex_values: Dict[str, str] = defaultdict(
        lambda: r"(?!x)x",
        {
            "python": r"^.*?\.py$",
            "dockerfile": r"^Dockerfile$",
            "shell": r"^.*?\.sh$",
            "rust": r"^.*?\.rs$",
            "typescript": r"^.*?\.ts$",
            "tsx": r"^.*?\.tsx$",
            # bugfix: was r"^.*?\.js[x]$" — the character class made the
            # trailing "x" mandatory, so only *.jsx matched and plain *.js
            # source files were silently missed.
            "javascript": r"^.*?\.jsx?$",
            "fluent": r"^.*?\.ftl$",
        },
    )

    def __init__(self, tool_json: Dict[str, Any]):
        """
        :param tool_json: A map describing the tool; can be accessed by
            indexing the instance; domain is defined by the tool JSON schema
        """
        self.name: str = tool_json["name"]
        self.kind: SastToolKind = SastToolKind[tool_json["kind"]]
        # Per-tool magic values: the tool's own name doubles as the content
        # regex for pipeline and pre-commit definition files.
        self.special_regex_values: Dict[
            str, re.Pattern[str]
        ] = self.default_special_regex_values | {
            "$PDF_data_re": re.compile(f"{tool_json['name']}"),
            "$PCH_data_re": re.compile(f"{tool_json['name']}"),
        }
        tool_json = self.__compile_regex(tool_json)
        tool_json = self.__add_source_file_regex(tool_json)
        self.tool_json: Dict[str, Any] = tool_json

    def __add_source_file_regex(
        self, tool_json: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Adds a `source_file_regex` key to the input map and populates the
        value with a regex that should match the names of source files of
        languages in the input map's `languages` array.

        :return: The updated map
        """
        tool_json["source_file_regex"] = re.compile(
            "("
            + "|".join(
                self.default_language_regex_values[language]
                for language in tool_json["languages"]
            )
            + ")"
        )
        # bugfix: was logging.info (root logger); use the module logger for
        # consistency with the rest of the file.
        logger.info(
            f"{self.name} is using {tool_json['source_file_regex']} to find source files"
        )
        return tool_json

    def __compile_regex(self, tool_json: Dict[str, Any]) -> Dict[str, Any]:
        """
        Replaces string values in the input dict that represent regular
        expressions with their compiled versions.

        :return: The updated dict
        """
        # Fields that never hold a regex and must be left untouched.
        non_regex_keys: Set[str] = {
            "name",
            "description",
            "url",
            "languages",
            "stars",
            "applicable",
        }
        special_regex_value_prefix: str = "$"
        for k, v in tool_json.items():
            if k in non_regex_keys:
                continue
            if v.startswith(special_regex_value_prefix):
                # replace fields with magic values with their pre-compiled
                # regexes
                tool_json[k] = self.special_regex_values[v]
            elif v == "":
                # use the match-nothing regex for empty fields
                tool_json[k] = self.special_regex_values["$matchnothing"]
            else:
                if v == ".":
                    # a lone "." means a literal dot, not "any character"
                    v = "\\."
                tool_json[k] = re.compile(v)
        return tool_json

    def __getitem__(self, index: str) -> Any:
        return self.tool_json[index]

    @classmethod
    def from_file_validate(cls, schema: Dict[str, Any], file: Path) -> SastTool:
        """
        Constructs an instance from a JSON file describing a tool. Validates
        the file against the expected schema before using it.
        """
        with file.open(mode="r") as f:
            tool_json: Dict[str, Any] = json.load(f)
        jsonschema.validate(tool_json, schema)
        return cls(tool_json)

    # pylint: disable-next=too-complex,too-many-return-statements
    def check_file(self, f: Path) -> bool:
        """
        :return: True iff the file 'f' indicates that the SAST tool is being
            used in the project
        """
        logger.debug(f"Check {self['name']} on {f.name}")
        if not f.is_file():
            return False
        logstring = f"Detected {self.name} in {f} via strategy "
        # If the file is a source file of a supported language, see if we can
        # find tool-specific source code or comment artifacts.
        if self._check_file(f, self["source_file_regex"], self["SLD"]):
            logger.info(logstring + "SLD")
            return True
        # If the file looks like a tool-specific configuration file report the
        # tool as being present. Optionally check that the content indicates it
        # as well.
        if self._check_file(f, self["TCF_name"], self["TCF_data"]):
            logger.info(logstring + "TCF")
            return True
        # If the file looks like a pipeline/CI/CD/... definition file, check
        # the content to see if it looks like they run the tool.
        if self._check_file(f, self["PDF_name"], self["PDF_data"]):
            logger.info(logstring + "PDF")
            return True
        # If the file looks like a pre-commit hook definition file, check
        # the content to see if it looks like they run the tool.
        if self._check_file(f, self["PCH_name"], self["PCH_data"]):
            logger.info(logstring + "PCH")
            return True
        # If the file looks like a language-tooling configuration file, check
        # the content to see if it looks like they configure the tool in it.
        if self._check_file(f, self["CF_name"], self["CF_data"]):
            logger.info(logstring + "CF")
            return True
        # If the file looks like a Readme, check if they proudly present the
        # tool's badge in it.
        if self._check_file(f, self["BDG_name"], self["BDG_data"]):
            # bugfix: this strategy was the only one that did not log its hit.
            logger.info(logstring + "BDG")
            return True
        return False

    def _check_file(
        self,
        f: Path,
        name_regex: Optional[re.Pattern[str]],
        content_regex: Optional[re.Pattern[str]] = None,
    ) -> bool:
        """
        :return: True iff a file's name and contents match the respective
            regular expressions; if no content regex is supplied only the
            filename is checked
        """
        if not name_regex or not re.search(name_regex, f.name):
            return False
        if not content_regex:
            logger.info(f"Filename matches {name_regex} and there is no content_regex")
            return True
        # robustness: project files are not guaranteed to be valid UTF-8
        # (binaries, exotic encodings); replace undecodable bytes instead of
        # letting UnicodeDecodeError abort the whole check.
        with f.open(mode="r", errors="replace") as lines:
            for line in lines:
                if not re.search(content_regex, line):
                    continue
                logger.info(
                    f"Filename matches {name_regex} and <{line}> matches {content_regex}"
                )
                return True
        return False

    @property
    def weight(self) -> float:
        """
        Not all tools are equally good. Here we supply a rather arbitrary
        weight to influence how much effect the presence of the tool has on
        the final score. The higher the weight, the more I like the tool.

        :return: The weight
        """
        return SastToolKind.weight(self.kind)
class SastUsageBasic(CheckInterface):
    """
    Checks which SAST tools a project uses for the languages it is written
    in and turns the detections into a score.
    """

    # Files/directories that are never scanned for tool artifacts.
    exclude: re.Pattern[str] = re.compile("(^.git$|test)")

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        # Fail fast before the comparatively expensive schema/tool loading
        # below (the check previously ran last). Also fixes the "conatins"
        # typo in the error message.
        if not self.proj.languages():
            raise CheckConstructionException("Project contains no languages?!")
        self.tool_schema: Dict[str, Any] = self.__load_tool_schema()
        self.__generate_tools()
        self.tools: List[SastTool] = self.__load_tools()
        self.lang_tools: Dict[str, List[SastTool]] = self.__build_lang_tools()

    def __load_tool_schema(self) -> Dict[str, Any]:
        """
        Loads the JSON schema of the tool definitions from permanent storage.

        config: tool_schema

        :return: JSON schema of a single tool
        :raises CheckConstructionException: if the configured schema path is
            not a file (was a bare assert, which is stripped under ``-O``)
        """
        schema_path: Path = context.settings[f"{self.name}_tool_schema"]
        if not schema_path.is_file():
            raise CheckConstructionException(
                f"Tool schema for {self.name} is not a file: {schema_path}"
            )
        logger.info(
            f"Loading tool definition schema for {self.name} from {schema_path}"
        )
        with schema_path.open(mode="r") as f:
            return json.load(f)

    def __generate_tools(self) -> None:
        """
        Generates the individual JSON tool definitions from a CSV file that
        describes all of them.

        config: tools_csv
        effect: populates the directory config::tools_dir
        note: no-op if directory is not empty
        """
        tools_dir: Path = context.settings[f"{self.name}_tools_dir"]
        # grepme: Always generate for testing
        # if len(list(tools_dir.iterdir())) != 0:
        #     return
        tool_defs: Path = Path(context.settings[f"{self.name}_tools_csv"])
        with tool_defs.open(mode="r", encoding="UTF-8") as f:
            reader = csv.reader(f, delimiter="\t")
            header: List[str] = next(reader)
            for row in reader:
                tool: Dict[str, Any] = dict(zip(header, row))
                # bugfix: csv.reader yields strings, so the original
                # comparison with the int 0 was always False and tools
                # flagged as not applicable were generated anyway.
                if str(tool["applicable"]) == "0":
                    continue
                tool["languages"] = str(tool["languages"]).split(" ")
                jsonschema.validate(tool, self.tool_schema)
                out_file: Path = tools_dir / f"{tool['name']}.json"
                logger.info(f"Generated tool: {out_file.as_posix()}")
                out_file.write_text(json.dumps(tool))

    def __load_tools(self) -> List[SastTool]:
        """
        Loads the JSON tool definitions from permanent storage.

        config: tools_dir

        :return: list of tools
        :raises CheckConstructionException: if the configured tools path is
            not a directory (was a bare assert, stripped under ``-O``)
        """
        tools_dir: Path = context.settings[f"{self.name}_tools_dir"]
        if not tools_dir.is_dir():
            raise CheckConstructionException(
                f"Tools directory for {self.name} does not exist: {tools_dir}"
            )
        tools: List[SastTool] = [
            SastTool.from_file_validate(self.tool_schema, file)
            for file in tools_dir.iterdir()
        ]
        logger.info(f"Loaded tools {[t['name'] for t in tools]}")
        return tools

    def __build_lang_tools(self) -> Dict[str, List[SastTool]]:
        """
        Constructs a mapping from programming languages to tools

        :return: The mapping
        """
        mapping: Dict[str, List[SastTool]] = defaultdict(list)
        for tool in self.tools:
            for language in tool["languages"]:
                mapping[language].append(tool)
        # bugfix: was logging.info (root logger); use the module logger.
        logger.info(
            f"Built mapping: { {l: [t['name'] for t in tools] for l, tools in mapping.items()} }"
        )
        return mapping

    def _detect_sast_tools(
        self,
    ) -> Dict[str, List[SastTool]]:
        """
        Performs the actual "analysis". Builds map that takes programming
        languages to the set of SAST tools that the project uses for this
        language.

        :return: The mapping
        """
        detected_tools: Dict[str, List[SastTool]] = defaultdict(list)
        # Hoisted out of the loop: lower-cased names of the project languages
        # (was recomputed via map() for every language in lang_tools).
        project_languages: Set[str] = {
            str(lang).lower() for lang in dict(self.proj.languages())
        }
        for lang, tools in self.lang_tools.items():
            if lang not in project_languages:
                continue
            # bugfix: the original removed items from the very list it was
            # iterating (`for tool in tools: ... tools.remove(tool)`), which
            # skips the element after each removal, and it also destructively
            # mutated self.lang_tools. Work on copies instead.
            remaining: List[SastTool] = list(tools)
            for f in self._gen_file_list():
                if not remaining:
                    # every tool for this language already detected
                    break
                for tool in list(remaining):
                    if tool.check_file(f):
                        # reworded (was "Removing tool ... for lanugage"):
                        # we record a detection and stop checking this tool.
                        logger.info(
                            f"Detected tool {tool.name} for language {lang}"
                        )
                        detected_tools[lang].append(tool)
                        remaining.remove(tool)
        logger.info(
            "Detected SAST tools "
            f"{[(lang, [tool.name for tool in tools]) for lang, tools in detected_tools.items()]}"
            f" for project {self.proj.id}."
        )
        return detected_tools

    def _calc_score(self, detected_tools: Dict[str, List[SastTool]]) -> float:
        """
        Consumes the result of `_detect_sast_tools` and calculates the final
        score out of it: each language's share of the project (percent,
        normalized to a fraction) is multiplied by the best weight among the
        tools detected for that language.

        :return: score
        """
        score: float = 0.0
        for lang, lweight in dict(self.proj.languages()).items():
            lang = str(lang)
            # languages() reports percentages; normalize to a fraction.
            lweight = float(lweight) / 100
            logger.info(f"Language {lang} has weight {lweight}")
            tweight: float = 0.0
            # .get() so a plain dict works too (the original relied on
            # detected_tools being a defaultdict).
            for tool in detected_tools.get(lang.lower(), []):
                tweight = max(tweight, tool.weight)
            logger.info(f"Maximum tool weight is {tweight}")
            score += lweight * tweight
        return score

    def run(self, args_dict: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """
        Entry point: runs the detection and assembles the score/results map,
        merged over the base class's result.
        """
        ret: Dict[str, Any] = super().run(args_dict)
        detected_tools: Dict[str, List[SastTool]] = self._detect_sast_tools()
        results: Dict[str, Any] = {
            "lang_tools": [
                [lang, [tool.name for tool in tools]]
                for lang, tools in detected_tools.items()
            ],
        }
        # NOTE(review): sanity check only — stripped under -O; keep as assert
        # to preserve the existing failure mode for callers.
        assert self.results_valid(results)
        return {
            "score": self._calc_score(detected_tools),
            "results": results,
        } | ret