Source code for src.checks.checked_in_binaries

"""Check that determines the file type for every file in
the project and compares it to a blacklist of binary executable file
formats"""

import re
import logging

from typing import (
    List,
    Dict,
    Tuple,
    Type,
    DefaultDict,
    Any,
    Hashable,
    Set,
    Optional,
)
from pathlib import Path
from collections import defaultdict
from fact_helper_file import get_file_type_from_path
from git.repo import Repo

from src.config import context
from src.opencode_git import OpenCodeGit
from src.interfaces import CheckInterface
from .interfaces_checked_in_binaries import (
    FileTypeInterface,
    FileTypeToolInterface,
)

logger = logging.getLogger(__name__)

# Tool: fkie-cad / fact_helper_file


class FactHelperFileFileType(FileTypeInterface):
    """File type description as produced by ``fact_helper_file``.

    Holds the ``mime`` and ``full`` entries of the result dictionary.
    """

    def __init__(self, ft: Dict[str, str]) -> None:
        """Store the ``"mime"`` and ``"full"`` entries of *ft*.

        Raises:
            KeyError: if *ft* lacks one of the two keys.
        """
        self.mime: str = ft["mime"]
        self.full: str = ft["full"]

    def _key(self) -> Tuple[Hashable, ...]:
        """Return the identifying tuple of this file type.

        Only the MIME type is part of the key; the ``full`` description
        is left out.
        """
        # NOTE(review): presumably FileTypeInterface builds equality and
        # hashing on top of this key -- confirm against the interface.
        identity: Tuple[Hashable, ...] = (self.mime,)
        return identity
class FactHelperFile(FileTypeToolInterface):
    """File type detection tool backed by ``fact_helper_file``."""

    def file_type_of(self, file: Path) -> FactHelperFileFileType:
        """Classify *file* and wrap the result in a file type object."""
        raw_result = get_file_type_from_path(file)
        return FactHelperFileFileType(raw_result)
# check
class CheckedInBinaries(CheckInterface):
    """Represents a check that determines the file type for every file in
    the project and compares it to a blacklist of binary executable file
    formats.

    The blacklist is derived from sample files kept in ``blacklist_dir``:
    every sample is classified with each tool in ``fileTypeTools`` and the
    resulting file types are blacklisted, unless they appear in
    ``whitelist``.
    """

    # Local directory holding the checkout of known-bad sample files.
    blacklist_dir: Path = context.settings["CheckedInBinaries_blacklist_dir"]
    # Not used inside this class.
    # NOTE(review): presumably consumed by the file listing of the base
    # class to skip paths -- confirm. The dot is escaped so that only a
    # literal ".git" matches, not any four-character name ending in "git".
    exclude: re.Pattern = re.compile(r"(^\.git$|test)")
    # Class-level on purpose: filled once by the first instance and then
    # shared by all later instances.
    blacklist: Set[FileTypeInterface] = set()
    # File types that are never added to the blacklist because they are
    # too generic (see __is_too_generic).
    whitelist: Set[FileTypeInterface] = {
        FactHelperFileFileType(
            {"mime": "application/octet-stream", "full": "data"}
        )
    }
    # Tool classes used for file type detection.
    fileTypeTools: List[Type[FileTypeToolInterface]] = [FactHelperFile]

    def __init__(self, *args, **kwargs):
        """Set up the check and, if still empty, the shared blacklist.

        All arguments are forwarded unchanged to the base class.
        """
        super().__init__(*args, **kwargs)
        if not self.blacklist:
            logger.info("Initializing blacklist of executable file formats")
            self.__init_blacklist()
            logger.info(f"Using blacklist {self.blacklist}")
        # One entry per tool run: (tool class, file type -> offending paths)
        self.all_tool_findings: List[
            Tuple[
                Type[FileTypeToolInterface],
                Dict[FileTypeInterface, List[Path]],
            ]
        ] = []
        # Aggregated over all tools: file type -> offending paths
        self.all_violations: Dict[FileTypeInterface, List[Path]] = defaultdict(
            list
        )

    def __update_blacklist(self) -> None:
        """Bring the local blacklist checkout up to date.

        A non-empty ``blacklist_dir`` is treated as an existing git
        checkout and updated in place; otherwise (directory missing or
        empty) the blacklist repository is cloned into it.
        """
        # is_dir() guards against a missing directory, for which iterdir()
        # would raise FileNotFoundError; any() stops at the first entry
        # instead of listing the whole directory.
        has_checkout: bool = self.blacklist_dir.is_dir() and any(
            self.blacklist_dir.iterdir()
        )
        if has_checkout:
            # check for updates of an existing blacklist
            logger.info("Existing blacklist found, updating...")
            repo: Repo = Repo(self.blacklist_dir)
            git = repo.git
            git.fetch()
            git.merge("--strategy-option", "theirs", "--no-edit")
            git.pull("-X", "theirs")
        else:
            # fetch a new copy of the blacklist
            logger.info("You have no blacklist, fetching current version ...")
            OpenCodeGit.clone_project(
                context.settings["CheckedInBinaries_blacklist_repo"],
                self.blacklist_dir,
            )

    def __init_blacklist(self) -> None:
        """Fill ``blacklist`` with the file types of all known-bad samples.

        Entries whose name contains ``README.md`` or ``.git`` are skipped,
        as are file types listed in ``whitelist``.
        """
        self.__update_blacklist()
        for bad_file in self.blacklist_dir.iterdir():
            if "README.md" in bad_file.name or ".git" in bad_file.name:
                continue
            logger.debug(f"Processing known bad file {bad_file}")
            for tool in self.fileTypeTools:
                tool_instance: FileTypeToolInterface = tool()
                file_type: FileTypeInterface = tool_instance.file_type_of(
                    bad_file
                )
                logger.debug(f"{tool} classfied {bad_file} as {file_type}")
                if self.__is_too_generic(file_type):
                    # will produce too many false positives
                    logger.debug(
                        f"Not adding {file_type} to blacklist as it is "
                        " too generic"
                    )
                    continue
                self.blacklist.add(file_type)

    def __is_too_generic(self, file_type: FileTypeInterface) -> bool:
        """Return True if *file_type* is contained in ``whitelist``."""
        return file_type in self.whitelist

    def _run_all_tools(
        self,
    ) -> None:
        """For each available tool or library the set of detected file
        types is determined. All files with illegal file types are
        recorded.

        Appends one ``(tool, findings)`` entry per tool to
        ``all_tool_findings``. Allowed file types are recorded with an
        empty path list, blacklisted ones with the offending paths.
        """
        logger.info(
            "Running multi-tool file type detection for "
            f"project {self.proj.id}"
        )
        for tool in self.fileTypeTools:
            logger.info(
                f"Using {tool.name} to detect file types in project"
                f" {self.proj.id}"
            )
            tool_instance: FileTypeToolInterface = tool()
            findings: DefaultDict[FileTypeInterface, List[Path]] = defaultdict(
                list
            )
            for file in self._gen_file_list():
                file_type: FileTypeInterface = tool_instance.file_type_of(file)
                logger.debug(f"{tool.name} classfied {file} as {file_type}")
                if self._is_ok(file_type):
                    # Register the allowed file type without any path so
                    # it still shows up in the list of detected types.
                    findings.setdefault(file_type, [])
                else:
                    # only files with disallowed file types are recorded
                    logger.info(
                        f"{file} with {file_type} is considered harmful"
                    )
                    findings[file_type].append(file)
            logger.info(
                f"{tool.name} detected file types "
                f"{self.__format_findings(findings)}"
            )
            self.all_tool_findings.append((tool, findings))
        logger.info(
            f"Results of file type detection: {self._format_findings()}"
        )

    def _format_findings(self) -> Dict[str, Dict[str, List[str]]]:
        """Return all tool findings as plain strings.

        Maps each tool name to a mapping of file type (as ``str``) to the
        POSIX-style paths recorded for it.
        """
        return {
            tool.name: self.__format_findings(findings)
            for tool, findings in self.all_tool_findings
        }

    def __format_findings(
        self, findings: Dict[FileTypeInterface, List[Path]]
    ) -> Dict[str, List[str]]:
        """Convert one findings mapping to strings.

        Keys become ``str(file_type)``, paths become POSIX-style strings.
        """
        return {
            str(file_type): [p.as_posix() for p in paths]
            for file_type, paths in findings.items()
        }

    def _is_ok(self, file_type: FileTypeInterface) -> bool:
        """Return True if *file_type* is not blacklisted."""
        return file_type not in self.blacklist

    def _calc_score(
        self,
    ) -> float:
        """Return 1.0 if no violation was recorded, 0.0 otherwise."""
        return 0.0 if self.all_violations else 1.0

    def _determine_violations(self) -> None:
        """Collect the offending paths of all tools in ``all_violations``.

        File types with an empty path list are skipped, so only file
        types with at least one offending file become keys.
        """
        for _, findings in self.all_tool_findings:
            for file_type, path_list in findings.items():
                if path_list:
                    self.all_violations[file_type].extend(path_list)

    def run(
        self, args_dict: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """Run the check and return its result dictionary.

        Returns:
            The dictionary returned by the base class ``run`` extended by
            ``"score"`` and ``"results"``. ``results["ft_paths"]`` is a
            list of ``[file type, paths]`` pairs, all as strings.

        Raises:
            AssertionError: if ``results_valid`` rejects the results.
        """
        ret: Dict[str, Any] = super().run(args_dict)
        self._run_all_tools()
        self._determine_violations()
        results: Dict[str, Any] = {
            "ft_paths": [
                [file_type, paths]
                for file_type, paths in self.__format_findings(
                    self.all_violations
                ).items()
            ]
        }
        assert self.results_valid(results)
        return ret | {
            "score": self._calc_score(),
            "results": results,
        }