Merge pull request #152 from advanced-security/v2_9_1

V2.9.1
advanced-security · Jan 10, 2025 · ecab553
2 parents da84215 + aad03d0
commit ecab553
Show file tree

Hide file tree

Showing 15 changed files with 314 additions and 148 deletions.
diff --git a/.release.yml b/.release.yml
@@ -1,11 +1,15 @@
 name: "policy-as-code"
-version: "2.9.0"
+repository: "advanced-security/policy-as-code"
+version: "2.9.1"
+
+ecosystems:
+  - Python
 
 locations:
   - name: "Update Docs"
     paths:
       - "*.md"
+      - "docs/*.md"
     patterns:
-      - 'advanced-security/policy-as-code@v([0-9]\.[0-9]\.[0-9])'
-      - '--branch "v([0-9]\.[0-9]\.[0-9])"'
-
+      - "{repository}@v{version}"
+      - '--branch "v{version}"'
diff --git a/Pipfile b/Pipfile
@@ -6,7 +6,7 @@ verify_ssl = true
 [packages]
 pyyaml = "*"
 semantic-version = "*"
-ghastoolkit = "==0.14.2"
+ghastoolkit = "==0.15.1"
 
 [dev-packages]
 sphinx = "*"

diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/README.md b/README.md
@@ -45,7 +45,7 @@ Here is how you can quickly setup policy-as-code.
 ```yaml
 # Policy as Code
 - name: Advance Security Policy as Code
-  uses: advanced-security/policy-as-code@v2.9.0
+  uses: advanced-security/policy-as-code@v2.9.1
 ```
 
 > [!WARNING]
@@ -61,15 +61,15 @@ The Policy as Code project is a self-contained Python based CLI tool.
 **Bash / Zsh:**
 
 ```bash
-git clone --branch "v2.9.0" https://github.com/advanced-security/policy-as-code.git && cd ./policy-as-code
+git clone --branch "v2.9.1" https://github.com/advanced-security/policy-as-code.git && cd ./policy-as-code
 
 ./policy-as-code --help
 ```
 
 **Powershell:**
 
 ```Powershell
-git clone --branch "v2.9.0" https://github.com/advanced-security/policy-as-code.git
+git clone --branch "v2.9.1" https://github.com/advanced-security/policy-as-code.git
 cd policy-as-code
 
 .\policy-as-code.ps1 --help
@@ -128,7 +128,7 @@ Here is an example of using a simple yet cross-organization using Policy as Code
 ```yaml
 # Compliance
 - name: Advance Security Policy as Code
-  uses: advanced-security/policy-as-code@v2.9.0
+  uses: advanced-security/policy-as-code@v2.9.1
   with:
     # The owner/repo of where the policy is stored
     policy: GeekMasher/security-queries

diff --git a/ghascompliance/__version__.py b/ghascompliance/__version__.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-__version__ = "2.9.0"
+__version__ = "2.9.1"
 
 __title__ = "GitHub Advanced Security Policy as Code"
 __name__ = "ghascompliance"

diff --git a/vendor/charset_normalizer/api.py b/vendor/charset_normalizer/api.py
@@ -159,6 +159,8 @@ def from_bytes(
 
     results: CharsetMatches = CharsetMatches()
 
+    early_stop_results: CharsetMatches = CharsetMatches()
+
     sig_encoding, sig_payload = identify_sig_or_bom(sequences)
 
     if sig_encoding is not None:
@@ -221,16 +223,20 @@ def from_bytes(
         try:
             if is_too_large_sequence and is_multi_byte_decoder is False:
                 str(
-                    sequences[: int(50e4)]
-                    if strip_sig_or_bom is False
-                    else sequences[len(sig_payload) : int(50e4)],
+                    (
+                        sequences[: int(50e4)]
+                        if strip_sig_or_bom is False
+                        else sequences[len(sig_payload) : int(50e4)]
+                    ),
                     encoding=encoding_iana,
                 )
             else:
                 decoded_payload = str(
-                    sequences
-                    if strip_sig_or_bom is False
-                    else sequences[len(sig_payload) :],
+                    (
+                        sequences
+                        if strip_sig_or_bom is False
+                        else sequences[len(sig_payload) :]
+                    ),
                     encoding=encoding_iana,
                 )
         except (UnicodeDecodeError, LookupError) as e:
@@ -367,7 +373,13 @@ def from_bytes(
                 and not lazy_str_hard_failure
             ):
                 fallback_entry = CharsetMatch(
-                    sequences, encoding_iana, threshold, False, [], decoded_payload
+                    sequences,
+                    encoding_iana,
+                    threshold,
+                    False,
+                    [],
+                    decoded_payload,
+                    preemptive_declaration=specified_encoding,
                 )
                 if encoding_iana == specified_encoding:
                     fallback_specified = fallback_entry
@@ -421,28 +433,58 @@ def from_bytes(
                 ),
             )
 
-        results.append(
-            CharsetMatch(
-                sequences,
-                encoding_iana,
-                mean_mess_ratio,
-                bom_or_sig_available,
-                cd_ratios_merged,
-                decoded_payload,
-            )
+        current_match = CharsetMatch(
+            sequences,
+            encoding_iana,
+            mean_mess_ratio,
+            bom_or_sig_available,
+            cd_ratios_merged,
+            (
+                decoded_payload
+                if (
+                    is_too_large_sequence is False
+                    or encoding_iana in [specified_encoding, "ascii", "utf_8"]
+                )
+                else None
+            ),
+            preemptive_declaration=specified_encoding,
         )
 
+        results.append(current_match)
+
         if (
             encoding_iana in [specified_encoding, "ascii", "utf_8"]
             and mean_mess_ratio < 0.1
         ):
+            # If md says nothing to worry about, then... stop immediately!
+            if mean_mess_ratio == 0.0:
+                logger.debug(
+                    "Encoding detection: %s is most likely the one.",
+                    current_match.encoding,
+                )
+                if explain:
+                    logger.removeHandler(explain_handler)
+                    logger.setLevel(previous_logger_level)
+                return CharsetMatches([current_match])
+
+            early_stop_results.append(current_match)
+
+        if (
+            len(early_stop_results)
+            and (specified_encoding is None or specified_encoding in tested)
+            and "ascii" in tested
+            and "utf_8" in tested
+        ):
+            probable_result: CharsetMatch = early_stop_results.best()  # type: ignore[assignment]
             logger.debug(
-                "Encoding detection: %s is most likely the one.", encoding_iana
+                "Encoding detection: %s is most likely the one.",
+                probable_result.encoding,
             )
             if explain:
                 logger.removeHandler(explain_handler)
                 logger.setLevel(previous_logger_level)
-            return CharsetMatches([results[encoding_iana]])
+
+            return CharsetMatches([probable_result])
 
         if encoding_iana == sig_encoding:
             logger.debug(

diff --git a/vendor/charset_normalizer/cli/__main__.py b/vendor/charset_normalizer/cli/__main__.py
@@ -109,6 +109,14 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
         dest="force",
         help="Replace file without asking if you are sure, use this flag with caution.",
     )
+    parser.add_argument(
+        "-i",
+        "--no-preemptive",
+        action="store_true",
+        default=False,
+        dest="no_preemptive",
+        help="Disable looking at a charset declaration to hint the detector.",
+    )
     parser.add_argument(
         "-t",
         "--threshold",
@@ -133,31 +141,47 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
     args = parser.parse_args(argv)
 
     if args.replace is True and args.normalize is False:
+        if args.files:
+            for my_file in args.files:
+                my_file.close()
         print("Use --replace in addition of --normalize only.", file=sys.stderr)
         return 1
 
     if args.force is True and args.replace is False:
+        if args.files:
+            for my_file in args.files:
+                my_file.close()
         print("Use --force in addition of --replace only.", file=sys.stderr)
         return 1
 
     if args.threshold < 0.0 or args.threshold > 1.0:
+        if args.files:
+            for my_file in args.files:
+                my_file.close()
         print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
         return 1
 
     x_ = []
 
     for my_file in args.files:
-        matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose)
+        matches = from_fp(
+            my_file,
+            threshold=args.threshold,
+            explain=args.verbose,
+            preemptive_behaviour=args.no_preemptive is False,
+        )
 
         best_guess = matches.best()
 
         if best_guess is None:
             print(
                 'Unable to identify originating encoding for "{}". {}'.format(
                     my_file.name,
-                    "Maybe try increasing maximum amount of chaos."
-                    if args.threshold < 1.0
-                    else "",
+                    (
+                        "Maybe try increasing maximum amount of chaos."
+                        if args.threshold < 1.0
+                        else ""
+                    ),
                 ),
                 file=sys.stderr,
             )
@@ -258,8 +282,8 @@ def cli_detect(argv: Optional[List[str]] = None) -> int:
                 try:
                     x_[0].unicode_path = join(dir_path, ".".join(o_))
 
-                    with open(x_[0].unicode_path, "w", encoding="utf-8") as fp:
-                        fp.write(str(best_guess))
+                    with open(x_[0].unicode_path, "wb") as fp:
+                        fp.write(best_guess.output())
                 except IOError as e:
                     print(str(e), file=sys.stderr)
                     if my_file.closed is False:

diff --git a/vendor/charset_normalizer/constant.py b/vendor/charset_normalizer/constant.py
@@ -544,6 +544,8 @@
     "|",
     '"',
     "-",
+    "(",
+    ")",
 }
 
 

diff --git a/vendor/charset_normalizer/legacy.py b/vendor/charset_normalizer/legacy.py
@@ -1,13 +1,24 @@
-from typing import Any, Dict, Optional, Union
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Optional
 from warnings import warn
 
 from .api import from_bytes
 from .constant import CHARDET_CORRESPONDENCE
 
+# TODO: remove this check when dropping Python 3.7 support
+if TYPE_CHECKING:
+    from typing_extensions import TypedDict
+
+    class ResultDict(TypedDict):
+        encoding: Optional[str]
+        language: str
+        confidence: Optional[float]
+
 
 def detect(
     byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
-) -> Dict[str, Optional[Union[str, float]]]:
+) -> ResultDict:
     """
     chardet legacy method
     Detect the encoding of the given byte string. It should be mostly backward-compatible.

diff --git a/vendor/charset_normalizer/md.py b/vendor/charset_normalizer/md.py
@@ -236,7 +236,7 @@ def reset(self) -> None:  # pragma: no cover
 
     @property
     def ratio(self) -> float:
-        if self._character_count <= 24:
+        if self._character_count <= 13:
             return 0.0
 
         ratio_of_suspicious_range_usage: float = (
@@ -260,6 +260,7 @@ def __init__(self) -> None:
 
         self._buffer: str = ""
         self._buffer_accent_count: int = 0
+        self._buffer_glyph_count: int = 0
 
     def eligible(self, character: str) -> bool:
         return True
@@ -279,6 +280,14 @@ def feed(self, character: str) -> None:
                 and is_thai(character) is False
             ):
                 self._foreign_long_watch = True
+            if (
+                is_cjk(character)
+                or is_hangul(character)
+                or is_katakana(character)
+                or is_hiragana(character)
+                or is_thai(character)
+            ):
+                self._buffer_glyph_count += 1
             return
         if not self._buffer:
             return
@@ -291,17 +300,20 @@ def feed(self, character: str) -> None:
             self._character_count += buffer_length
 
             if buffer_length >= 4:
-                if self._buffer_accent_count / buffer_length > 0.34:
+                if self._buffer_accent_count / buffer_length >= 0.5:
                     self._is_current_word_bad = True
                 # Word/Buffer ending with an upper case accentuated letter are so rare,
                 # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
-                if (
+                elif (
                     is_accentuated(self._buffer[-1])
                     and self._buffer[-1].isupper()
                     and all(_.isupper() for _ in self._buffer) is False
                 ):
                     self._foreign_long_count += 1
                     self._is_current_word_bad = True
+                elif self._buffer_glyph_count == 1:
+                    self._is_current_word_bad = True
+                    self._foreign_long_count += 1
             if buffer_length >= 24 and self._foreign_long_watch:
                 camel_case_dst = [
                     i
@@ -325,6 +337,7 @@ def feed(self, character: str) -> None:
             self._foreign_long_watch = False
             self._buffer = ""
             self._buffer_accent_count = 0
+            self._buffer_glyph_count = 0
         elif (
             character not in {"<", ">", "-", "=", "~", "|", "_"}
             and character.isdigit() is False
@@ -544,6 +544,8 @@
         "|",
         '"',
         "-",
+        "(",
+        ")",
     }
@@ Expand Down @@