Source code for src.checks.checked_in_binaries

"""Check that determines the file type for every file in
the project and compares it to a blacklist of binary executable file
formats"""

import re
import logging

from typing import (
    List,
    Dict,
    Tuple,
    Type,
    DefaultDict,
    Any,
    Hashable,
    Set,
    Optional,
)
from pathlib import Path
from collections import defaultdict
from fact_helper_file import get_file_type_from_path
from git.repo import Repo

from src.config import context
from src.opencode_git import OpenCodeGit
from src.interfaces import CheckInterface
from .interfaces_checked_in_binaries import (
    FileTypeInterface,
    FileTypeToolInterface,
)

logger = logging.getLogger(__name__)

# Tool: fkie-cad / fact_helper_file



[docs]
class FactHelperFileFileType(FileTypeInterface):
    """File type built from a ``fact_helper_file`` result dictionary.

    Both the ``mime`` and the ``full`` entry are stored, but only
    ``mime`` is part of the tuple returned by ``_key``.
    """

    def __init__(self, ft: Dict[str, str]) -> None:
        """Store the ``"mime"`` and ``"full"`` entries of *ft*.

        Raises ``KeyError`` if *ft* lacks one of the two keys.
        """
        self.mime: str = ft["mime"]
        self.full: str = ft["full"]

    def _key(self) -> Tuple[Hashable, ...]:
        """Return ``(self.mime,)``; the ``full`` description is left out.

        NOTE(review): presumably ``FileTypeInterface`` derives equality
        and hashing from this tuple -- confirm against the interface.
        """
        return (self.mime,)





[docs]
class FactHelperFile(FileTypeToolInterface):
    """File type tool that delegates to ``get_file_type_from_path``."""

    def file_type_of(self, file: Path) -> FactHelperFileFileType:
        """Return the type detected for *file*, wrapped in a
        ``FactHelperFileFileType``."""
        return FactHelperFileFileType(get_file_type_from_path(file))




# check



[docs]
class CheckedInBinaries(CheckInterface):
    """Represents a check that determines the file type for every file in
    the project and compares it to a blacklist of binary executable file
    formats"""

    # directory that holds the known-bad sample files (from the settings)
    blacklist_dir: Path = context.settings["CheckedInBinaries_blacklist_dir"]
    # The dot is escaped so that the first alternative only matches a
    # literal ".git" (unescaped, "^.git$" also matched e.g. "agit").
    # NOTE(review): the pattern is not applied anywhere in this file;
    # presumably the file listing of CheckInterface uses it -- confirm.
    exclude: re.Pattern = re.compile(r"(^\.git$|test)")
    # Disallowed file types. This is a class attribute that __init__ only
    # fills while it is empty, so all instances share one blacklist.
    blacklist: Set[FileTypeInterface] = set()
    # file types that are never put on the blacklist (too generic)
    whitelist: Set[FileTypeInterface] = {
        FactHelperFileFileType(
            {"mime": "application/octet-stream", "full": "data"}
        )
    }
    # tool classes; each one is instantiated and asked for file types
    fileTypeTools: List[Type[FileTypeToolInterface]] = [FactHelperFile]


[docs]
    def __init__(self, *args, **kwargs):
        """Initialize the check and, if still empty, the blacklist.

        All arguments are handed to the parent initializer unchanged.
        """
        super().__init__(*args, **kwargs)

        # the blacklist lives on the class: once it has entries, later
        # instances skip the initialization
        if not self.blacklist:
            logger.info("Initializing blacklist of executable file formats")
            self.__init_blacklist()
            logger.info(f"Using blacklist {self.blacklist}")

        # one (tool class, findings) pair per tool that has been run
        self.all_tool_findings: List[
            Tuple[
                Type[FileTypeToolInterface],
                Dict[FileTypeInterface, List[Path]],
            ]
        ] = []
        # offending paths grouped by file type, merged over all tools
        self.all_violations: Dict[FileTypeInterface, List[Path]] = (
            defaultdict(list)
        )


    def __update_blacklist(self) -> None:
        """Make sure ``blacklist_dir`` holds a current blacklist.

        An empty directory gets a fresh clone of the repository named in
        the settings; a non-empty one is treated as an existing checkout
        and updated via fetch, merge and pull (preferring "theirs").
        """
        entries: List[Path] = list(self.blacklist_dir.iterdir())
        if not entries:
            # fetch a new copy of the blacklist
            logger.info("You have no blacklist, fetching current version ...")
            OpenCodeGit.clone_project(
                context.settings["CheckedInBinaries_blacklist_repo"],
                self.blacklist_dir,
            )
            return

        # check for updates of an existing blacklist
        logger.info("Existing blacklist found, updating...")
        checkout: Repo = Repo(self.blacklist_dir)
        checkout.git.fetch()
        checkout.git.merge("--strategy-option", "theirs", "--no-edit")
        checkout.git.pull("-X", "theirs")

    def __init_blacklist(self) -> None:
        """Fill the blacklist from the files in ``blacklist_dir``.

        The directory is updated first. Every entry whose name contains
        neither "README.md" nor ".git" is classified by each tool, and
        the resulting file type is blacklisted unless it is too generic.
        """
        self.__update_blacklist()

        ignored_name_parts: Tuple[str, ...] = ("README.md", ".git")
        for bad_file in self.blacklist_dir.iterdir():
            if any(part in bad_file.name for part in ignored_name_parts):
                continue

            logger.debug(f"Processing known bad file {bad_file}")
            for tool in self.fileTypeTools:
                detector: FileTypeToolInterface = tool()
                file_type: FileTypeInterface = detector.file_type_of(bad_file)
                logger.debug(f"{tool} classfied {bad_file} as {file_type}")
                if not self.__is_too_generic(file_type):
                    self.blacklist.add(file_type)
                else:
                    # will produce too many false positives
                    logger.debug(
                        f"Not adding {file_type} to blacklist as it is "
                        " too generic"
                    )

    def __is_too_generic(self, file_type: FileTypeInterface) -> bool:
        """Tell whether *file_type* is a member of the whitelist."""
        whitelisted: bool = file_type in self.whitelist
        return whitelisted


[docs]
    def _run_all_tools(
        self,
    ) -> None:
        """Classify all project files with every tool in ``fileTypeTools``.

        Per tool, each detected file type gets an entry in the findings;
        only the paths of files whose type fails ``_is_ok`` are listed
        under it. The ``(tool, findings)`` pair is then appended to
        ``all_tool_findings``."""
        logger.info(
            "Running multi-tool file type detection for "
            f"project {self.proj.id}"
        )

        for tool in self.fileTypeTools:
            logger.info(
                f"Using {tool.name} to detect file types in project"
                f" {self.proj.id}"
            )
            detector: FileTypeToolInterface = tool()
            findings: DefaultDict[FileTypeInterface, List[Path]] = defaultdict(
                list
            )
            for file in self._gen_file_list():
                file_type: FileTypeInterface = detector.file_type_of(file)
                logger.debug(f"{tool.name} classfied {file} as {file_type}")
                allowed: bool = self._is_ok(file_type)
                # looking the type up in the defaultdict registers it,
                # so allowed types show up with an empty path list
                flagged_paths: List[Path] = findings[file_type]
                if not allowed:
                    logger.info(
                        f"{file} with {file_type} is considered harmful"
                    )
                    flagged_paths.append(file)

            summary: Dict[str, List[str]] = self.__format_findings(findings)
            logger.info(f"{tool.name} detected file types {summary}")
            self.all_tool_findings.append((tool, findings))

        logger.info(
            f"Results of file type detection: {self._format_findings()}"
        )



[docs]
    def _format_findings(self) -> Dict[str, Dict[str, List[str]]]:
        """Return the findings of all tools in printable form.

        Maps each tool's ``name`` to its findings, with file types
        rendered via ``str`` and paths as POSIX strings."""
        return {
            tool.name: self.__format_findings(findings)
            for tool, findings in self.all_tool_findings
        }


    def __format_findings(
        self, findings: Dict[FileTypeInterface, List[Path]]
    ) -> Dict[str, List[str]]:
        """Convert *findings* to plain strings.

        Keys become ``str(file_type)`` and every path becomes its POSIX
        representation."""
        return {
            str(file_type): [path.as_posix() for path in paths]
            for file_type, paths in findings.items()
        }


[docs]
    def _is_ok(self, file_type: FileTypeInterface) -> bool:
        """Tell whether *file_type* is absent from the blacklist."""
        blacklisted: bool = file_type in self.blacklist
        return not blacklisted



[docs]
    def _calc_score(
        self,
    ) -> float:
        """Return 0.0 if any violation was recorded, otherwise 1.0."""
        if self.all_violations:
            return 0.0
        return 1.0



[docs]
    def _determine_violations(self):
        """Merge the findings of all tools into ``all_violations``.

        File types with an empty path list are skipped, so they never
        get an entry in ``all_violations``."""
        for _tool, per_type in self.all_tool_findings:
            for file_type, flagged_paths in per_type.items():
                if not flagged_paths:
                    continue
                self.all_violations[file_type].extend(flagged_paths)



[docs]
    def run(self, args_dict: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Run the check and return its outcome.

        The parent's result is extended by ``"score"`` and by
        ``"results"``, whose ``"ft_paths"`` entry lists one
        ``[file type, paths]`` pair per violating file type."""
        base_result: Dict[str, Any] = super().run(args_dict)

        self._run_all_tools()
        self._determine_violations()

        violations: Dict[str, List[str]] = self.__format_findings(
            self.all_violations
        )
        ft_paths: List[List[Any]] = [
            [type_name, path_names]
            for type_name, path_names in violations.items()
        ]
        results: Dict[str, Any] = {"ft_paths": ft_paths}
        assert self.results_valid(results)

        score: float = self._calc_score()
        return base_result | {
            "score": score,
            "results": results,
        }
Source code for src.checks.checked_in_binaries

occmd

Navigation

Related Topics