# Source code for src.checks.sast_usage_basic
"""Implementation of the SastUsageBasic check"""
from __future__ import annotations
import csv
import json
import logging
import re
from collections import defaultdict
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Set
import jsonschema
from src.config import context
from src.interfaces import CheckInterface
from src.exceptions import CheckConstructionException
logger: logging.Logger = logging.getLogger(__name__)
class SastToolKind(Enum):
    """
    Enumerates the different classes of SAST tools that we differentiate between
    in our check. Each SastTool has one and it determines "how good" it is if
    we detect it in a project.
    """

    LINTER = 1
    SECURITY = 2
    SECRET = 3
    SCA = 4

    @classmethod
    def weight(cls, kind: SastToolKind) -> float:
        """
        Encodes "how good" it is if we detect the tool in a project.

        :param kind: The tool class to rate
        :return: The detection weight of that tool class
        """
        # Dedicated security scanners count full; linters, secret scanners
        # and SCA tools count half.
        return {
            cls.LINTER: 0.5,
            cls.SECURITY: 1,
            cls.SECRET: 0.5,
            cls.SCA: 0.5,
        }[kind]
# Check
class SastTool:
    """
    Represents a tool that we can hope to detect in a project.

    The tool is described by a JSON-derived dict (validated against the tool
    JSON schema); its fields can be read by indexing the instance.
    """

    # Magic values for keys that store regular expressions. They are
    # automatically transformed to fixed regexes. Here are the ones that are not
    # parametrized by the tool. The others are constructed in the init function.
    #
    # note: empty fields are equivalent to $matchnothing
    default_special_regex_values: Dict[str, re.Pattern[str]] = {
        # NOTE(review): r"\\." matches a literal backslash followed by any
        # character, which essentially never occurs in POSIX paths. Possibly
        # r"\." was intended (as used by $PCH_path_re / $CF_path_re below);
        # confirm before changing.
        "$PDF_path_re": re.compile(r"\\."),
        "$PDF_name_re": re.compile(r"^\.gitlab-ci\.yml$"),
        "$PCH_path_re": re.compile(r"\."),
        "$PCH_name_re": re.compile(r"precommit"),
        "$CF_path_re": re.compile(r"\."),
        "$readme": re.compile(r"(README|[Rr]eadme)"),
        "$matchall": re.compile(r""),
        # (?!x)x can never match; used to disable a detection strategy.
        "$matchnothing": re.compile(r"(?!x)x"),
        # NOTE(review): same literal-backslash concern as $PDF_path_re above.
        "$rootdir": re.compile(r"\\."),
    }

    # Default language source file regex values: maps programming languages to
    # a regular expression that should recognize source files of that language
    # by name (usually file extension). Combined to form the source_file_regex
    # of a tool and then compiled. If there is no regex for a language it falls
    # back to a match-nothing regex.
    default_language_regex_values: Dict[str, str] = defaultdict(
        lambda: r"(?!x)x",
        {
            "python": r"^.*?\.py$",
            "dockerfile": r"^Dockerfile$",
            "shell": r"^.*?\.sh$",
            "rust": r"^.*?\.rs$",
            "typescript": r"^.*?\.ts$",
            "tsx": r"^.*?\.tsx$",
            # bugfix: was r"^.*?\.js[x]$" -- the one-element character class
            # [x] requires a literal 'x', so plain ".js" files were never
            # recognized. "x?" makes the trailing x optional (.js and .jsx).
            "javascript": r"^.*?\.jsx?$",
            "fluent": r"^.*?\.ftl$",
        },
    )

    def __init__(self, tool_json: Dict[str, Any]):
        """
        :param tool_json: A map describing the tool; can be accessed by indexing the instance; domain is defined by the tool JSON schema
        """
        self.name: str = tool_json["name"]
        self.kind: SastToolKind = SastToolKind[tool_json["kind"]]
        # Tool-parametrized magic values: the pipeline and pre-commit content
        # regexes simply search for the tool's name.
        self.special_regex_values: Dict[
            str, re.Pattern[str]
        ] = self.default_special_regex_values | {
            "$PDF_data_re": re.compile(f"{tool_json['name']}"),
            "$PCH_data_re": re.compile(f"{tool_json['name']}"),
        }
        tool_json = self.__compile_regex(tool_json)
        tool_json = self.__add_source_file_regex(tool_json)
        self.tool_json: Dict[str, Any] = tool_json

    def __getitem__(self, key: str) -> Any:
        """
        Read access to a field of the underlying tool definition.

        bugfix: this method was missing even though instances are indexed
        throughout the check (e.g. self["SLD"] in check_file and
        tool["languages"] in SastUsageBasic), which raised TypeError.

        :param key: Field name as defined by the tool JSON schema
        :return: The (possibly regex-compiled) field value
        """
        return self.tool_json[key]

    def __add_source_file_regex(
        self, tool_json: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Adds a `source_file_regex` key to the input map and populates the value
        with a regex that should match the names of source files of languages
        the input maps' `languages` array.

        :return: The updated map
        """
        # Alternation over the per-language default regexes; unknown
        # languages contribute the match-nothing fallback.
        tool_json["source_file_regex"] = re.compile(
            "("
            + "|".join(
                [
                    self.default_language_regex_values[language]
                    for language in tool_json["languages"]
                ]
            )
            + ")"
        )
        # consistency fix: was logging.info (root logger); use the module logger
        logger.info(
            f"{self.name} is using {tool_json['source_file_regex']} to find source files"
        )
        return tool_json

    def __compile_regex(self, tool_json: Dict[str, Any]) -> Dict[str, Any]:
        """
        Replaces string values in the input dict that represent regular
        expressions with their compiled versions.

        :return: The updated dict
        """
        # These keys hold plain data, not regular expressions.
        non_regex_keys: Set[str] = {
            "name",
            "description",
            "url",
            "languages",
            "stars",
            "applicable",
        }
        special_regex_value_prefix: str = "$"
        for k, v in tool_json.items():
            if k in non_regex_keys:
                continue
            if v.startswith(special_regex_value_prefix):
                # replace fields with magic values with their pre-compiled
                # regexes
                tool_json[k] = self.special_regex_values[v]
            elif v == "":
                # use the match-nothing regex for empty fields
                tool_json[k] = self.special_regex_values["$matchnothing"]
            else:
                # a bare "." means "match a literal dot", not "match anything"
                if v == ".":
                    v = "\\."
                tool_json[k] = re.compile(v)
        return tool_json

    @classmethod
    def from_file_validate(cls, schema: Dict[str, Any], file: Path) -> SastTool:
        """
        Constructs an instance from a JSON file describing a tool. Validates the
        file against the expected schema before using it.

        :param schema: The tool JSON schema to validate against
        :param file: Path of the JSON tool definition
        :raises jsonschema.ValidationError: if the file does not match the schema
        """
        with file.open(mode="r") as f:
            tool_json: Dict[str, Any] = json.load(f)
        jsonschema.validate(tool_json, schema)
        return cls(tool_json)

    # pylint: disable-next=too-complex,too-many-return-statements
    def check_file(self, f: Path) -> bool:
        """
        :return: True iff the file 'f' indicates that the SAST tool is being used in the project
        """
        logger.debug(f"Check {self['name']} on {f.name}")
        if not f.is_file():
            return False
        logstring = f"Detected {self.name} in {f} via strategy "
        # If the file is a source file of a supported language, see if we can
        # find tool-specific source code or comment artifacts.
        if self._check_file(f, self["source_file_regex"], self["SLD"]):
            logger.info(logstring + "SLD")
            return True
        # If the file looks like a tool-specific configuration file report the
        # tool as being present. Optionally check that the content indicates it
        # as well.
        if self._check_file(f, self["TCF_name"], self["TCF_data"]):
            logger.info(logstring + "TCF")
            return True
        # If the file looks like a pipeline/CI/CD/... definition file, check
        # the content to see if it looks like they run the tool.
        if self._check_file(f, self["PDF_name"], self["PDF_data"]):
            logger.info(logstring + "PDF")
            return True
        # If the file looks like a pre-commit hook definition file, check
        # the content to see if it looks like they run the tool.
        if self._check_file(f, self["PCH_name"], self["PCH_data"]):
            logger.info(logstring + "PCH")
            return True
        # If the file looks like a language-tooling configuration file, check
        # the content to see if it looks like they configure the tool in it.
        if self._check_file(f, self["CF_name"], self["CF_data"]):
            logger.info(logstring + "CF")
            return True
        # If the file looks like a Readme, check if they proudly present the
        # tool's badge in it.
        if self._check_file(f, self["BDG_name"], self["BDG_data"]):
            # consistency fix: log the winning strategy like all other branches
            logger.info(logstring + "BDG")
            return True
        return False

    def _check_file(
        self,
        f: Path,
        name_regex: Optional[re.Pattern[str]],
        content_regex: Optional[re.Pattern[str]] = None,
    ) -> bool:
        """
        :return: True iff a file's name and contents match the respective regular expressions; if no content regex is supplied only the filename is checked
        """
        if not name_regex or not re.search(name_regex, f.name):
            return False
        if not content_regex:
            logger.info(f"Filename matches {name_regex} and there is no content_regex")
            return True
        # robustness: repositories contain binary / non-UTF-8 files; replace
        # undecodable bytes instead of raising UnicodeDecodeError mid-scan.
        with f.open(mode="r", errors="replace") as lines:
            for line in lines:
                if not re.search(content_regex, line):
                    continue
                logger.info(f"Filename matches {name_regex} and <{line}> matches {content_regex}")
                return True
        return False

    @property
    def weight(self) -> float:
        """
        Not all tools are equally good. Here we supply a rather arbitrary weight
        to influence how much effect the presence of the tool has on the final
        score. The higher the weight, the more I like the tool.

        :return: The weight
        """
        return SastToolKind.weight(self.kind)
class SastUsageBasic(CheckInterface):
    """
    Check that detects which SAST tools a project uses per programming
    language and derives a score from the detections.
    """

    # Paths matching this pattern are excluded from the scanned file list.
    exclude: re.Pattern[str] = re.compile("(^.git$|test)")

    def __init__(self, *args: Any, **kwargs: Dict[str, Any]) -> None:
        """
        :raises CheckConstructionException: if the project reports no languages
        """
        super().__init__(*args, **kwargs)
        self.tool_schema: Dict[str, Any] = self.__load_tool_schema()
        self.__generate_tools()
        self.tools: List[SastTool] = self.__load_tools()
        self.lang_tools: Dict[str, List[SastTool]] = self.__build_lang_tools()
        if not self.proj.languages():
            # typo fix: was "conatins"
            raise CheckConstructionException("Project contains no languages?!")

    def __load_tool_schema(self) -> Dict[str, Any]:
        """
        Loads the JSON schema of the tool definitions from permanent storage.

        config: tool_schema

        :return: JSON schema of a single tool
        """
        schema_path: Path = context.settings[f"{self.name}_tool_schema"]
        assert schema_path.is_file()
        logger.info(
            f"Loading tool definition schema for {self.name} from {schema_path}"
        )
        with schema_path.open(mode="r") as f:
            return json.load(f)

    def __generate_tools(self) -> None:
        """
        Generates the individual JSON tool definitions from a CSV file that
        describes all of them.

        config: tools_csv
        effect: populates the directory config::tools_dir
        note: no-op if directory is not empty
        """
        tools_dir: Path = context.settings[f"{self.name}_tools_dir"]
        # Restored cache guard (was commented out with a "grepme: Always
        # generate for testing" debug note): the documented contract above is
        # that generation is a no-op when the directory is already populated.
        if len(list(tools_dir.iterdir())) != 0:
            return
        tool_defs: Path = Path(context.settings[f"{self.name}_tools_csv"])
        with tool_defs.open(mode="r", encoding="UTF-8") as f:
            reader = csv.reader(f, delimiter="\t")
            header: List[str] = next(reader)
            for row in reader:
                tool: Dict[str, Any] = dict(zip(header, row))
                # bugfix: csv.reader yields strings, so the original
                # comparison `tool["applicable"] == 0` was always False and
                # non-applicable tools were generated anyway.
                if str(tool["applicable"]) == "0":
                    continue
                # languages are stored space-separated in the CSV
                tool["languages"] = str(tool["languages"]).split(" ")
                jsonschema.validate(tool, self.tool_schema)
                tool_json: str = json.dumps(tool)
                logger.info(
                    f"""Generated tool: {(tools_dir / f"{tool['name']}.json").as_posix()}"""
                )
                (tools_dir / f"{tool['name']}.json").write_text(tool_json)

    def __load_tools(self) -> List[SastTool]:
        """
        Loads the JSON tool definitions from permanent storage.

        config: tools_dir

        :return: list of tools
        """
        tools_dir: Path = context.settings[f"{self.name}_tools_dir"]
        assert tools_dir.is_dir()
        tools: List[SastTool] = [
            SastTool.from_file_validate(self.tool_schema, file)
            for file in tools_dir.iterdir()
        ]
        logger.info(f"Loaded tools {[t['name'] for t in tools]}")
        return tools

    def __build_lang_tools(self) -> Dict[str, List[SastTool]]:
        """
        Constructs a mapping from programming languages to tools

        :return: The mapping
        """
        mapping: Dict[str, List[SastTool]] = defaultdict(list)
        for tool in self.tools:
            for language in tool["languages"]:
                mapping[language].append(tool)
        # consistency fix: was logging.info (root logger); use the module logger
        logger.info(
            f"Built mapping: { {lang: [t['name'] for t in tools] for lang, tools in mapping.items()} }"
        )
        return mapping

    def _detect_sast_tools(
        self,
    ) -> Dict[str, List[SastTool]]:
        """
        Performs the actual "analysis". Builds map that takes programming
        languages to the set of SAST tools that the project uses for this
        language.

        :return: The mapping
        """
        detected_tools: Dict[str, List[SastTool]] = defaultdict(list)
        # hoisted loop invariant: the lowered project-language set does not
        # change between iterations
        project_languages: Set[str] = {
            s.lower() for s in dict(self.proj.languages())
        }
        for lang, tools in self.lang_tools.items():
            if lang not in project_languages:
                continue
            for f in self._gen_file_list():
                # every candidate tool for this language was already found
                if not tools:
                    break
                # bugfix: iterate over a snapshot -- `tools` is mutated below,
                # and removing from a list while iterating it skips elements.
                for tool in list(tools):
                    if tool.check_file(f):
                        # typo fix: was "lanugage"
                        logger.info(f"Removing tool {tool.name} for language {lang}")
                        detected_tools[lang].append(tool)
                        # stop re-checking a tool once it was detected
                        tools.remove(tool)
        logger.info(
            "Detected SAST tools "
            f"{[(lang, [tool.name for tool in tools]) for lang, tools in detected_tools.items()]}"
            f" for project {self.proj.id}."
        )
        return detected_tools

    def _calc_score(self, detected_tools: Dict[str, List[SastTool]]) -> float:
        """
        Consumes the result of`_detect_sast_tools` and calculates the final
        score out of it. Each language contributes its share (percent / 100)
        times the weight of the best tool detected for it.

        :return: score
        """
        score: float = 0.0
        for lang, lweight in dict(self.proj.languages()).items():
            lang: str = str(lang)
            # languages() reports percentages; normalize to a 0..1 fraction
            lweight: float = float(lweight) / 100
            # consistency fix: was logging.info (root logger)
            logger.info(f"Language {lang} has weight {lweight}")
            tweight: float = 0.0
            # only the single best tool per language counts
            for tool in detected_tools[lang.lower()]:
                tweight = max(tweight, tool.weight)
            logger.info(f"Maximum tool weight is {tweight}")
            score += float(lweight) * tweight
        return score

    def run(self, args_dict: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """
        Executes the check.

        :return: score and per-language detected tool names, merged with the
            base class' result dict
        """
        ret: Dict[str, Any] = super().run(args_dict)
        detected_tools: Dict[str, List[SastTool]] = self._detect_sast_tools()
        results: Dict[str, Any] = {
            "lang_tools": [
                [lang, [tool.name for tool in tools]]
                for lang, tools in detected_tools.items()
            ],
        }
        assert self.results_valid(results)
        return {
            "score": self._calc_score(detected_tools),
            "results": results,
        } | ret