peteretelej · peteretelej · Jul 19, 2025 · Jul 19, 2025 · Jul 19, 2025 · Jul 19, 2025
diff --git a/.codecov.yml b/.codecov.yml
@@ -0,0 +1,15 @@
+coverage:
+  status:
+    project:  # status check on the coverage of the whole project
+      default:
+        target: 85%  # minimum coverage ratio for the check to succeed
+        threshold: 3%  # coverage drop still tolerated as a success
+    patch:  # status check on the coverage of lines changed by the commit/PR
+      default:
+        target: 70%  # minimum coverage ratio for the changed lines
+        threshold: 5%  # coverage drop still tolerated as a success
+
+comment:  # settings for the Codecov pull request comment
+  layout: "header, diff"  # sections included in the comment, in order
+  behavior: default
+  require_changes: true  # only post the comment when coverage changes
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "diffchunk"
-version = "0.1.6"
+version = "0.1.7"
 description = "MCP server for navigating large diff files with intelligent chunking"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -26,6 +26,7 @@ classifiers = [
 dependencies = [
     "click>=8.2.1",
     "mcp>=1.10.0",
+    "chardet>=4.0.0",
 ]
 
 [project.urls]

diff --git a/src/chunker.py b/src/chunker.py
@@ -33,7 +33,7 @@ def chunk_diff(
             raise ValueError(f"Failed to parse diff: {e}")
 
         if not file_changes:
-            raise ValueError("No valid diff content found")
+            raise ValueError("Diff file parsed successfully but contains no changes")
 
         for files, content in file_changes:
             # Apply filters

diff --git a/src/parser.py b/src/parser.py
@@ -123,29 +123,45 @@ def should_include_file(
         return True  # Include by default if no patterns specified
 
     def _read_diff_file(self, file_path: str) -> List[str]:
-        """Read diff file with proper encoding handling."""
-        # Try UTF-8 first (most common)
-        encodings = ["utf-8", "utf-8-sig", "cp1252", "latin-1"]
+        """Read a diff file as text, detecting its encoding with chardet.
+
+        Args:
+            file_path: Path of the diff file to read.
+
+        Returns:
+            The decoded content split into lines, line terminators kept.
+
+        Raises:
+            OSError: If the file cannot be opened or read.
+        """
+        import chardet
 
-        for encoding in encodings:
-            try:
-                with open(file_path, "r", encoding=encoding) as f:
-                    content = f.read()
+        # Detect encoding from sample
+        with open(file_path, "rb") as f:
+            sample = f.read(8192)
+        result = chardet.detect(sample)
 
-                # Strip BOM if present
-                if content.startswith("\ufeff"):
-                    content = content[1:]
+        # Trust the detection only when chardet is confident and actually
+        # named an encoding; otherwise default to UTF-8.
+        detected = result.get("encoding")
+        confident = (result.get("confidence") or 0) > 0.7
+        encoding = detected if detected and confident else "utf-8"
 
-                lines = content.splitlines(keepends=True)
-                return lines
+        try:
+            with open(file_path, "r", encoding=encoding) as f:
+                content = f.read()
+        except (UnicodeDecodeError, LookupError):
+            # The detected encoding was wrong for the full file, or names a
+            # codec Python does not provide: re-read as UTF-8, replacing
+            # undecodable bytes instead of failing.
+            with open(file_path, "r", encoding="utf-8", errors="replace") as f:
+                content = f.read()
 
-            except (UnicodeDecodeError, IOError):
-                continue
+        # Strip BOM if present
+        if content.startswith("\ufeff"):
+            content = content[1:]
 
-        # If all encodings failed, raise clear error
-        raise ValueError(
-            f"Cannot read diff file {file_path}: unable to decode with any common encoding"
-        )
+        return content.splitlines(keepends=True)
 
     def count_lines(self, content: str) -> int:
         """Count meaningful lines in diff content."""

diff --git a/tests/test_encodings.py b/tests/test_encodings.py
@@ -0,0 +1,66 @@
+"""Test encoding support for diff files."""
+
+from pathlib import Path
+
+import pytest
+
+from src.tools import DiffChunkTools
+
+
+class TestEncodings:
+    """Test encoding detection and parsing."""
+
+    @pytest.fixture
+    def test_data_dir(self):
+        """Return the directory holding the sample diff files."""
+        return Path(__file__).parent / "test_data"
+
+    @pytest.fixture
+    def tools(self):
+        """Return a fresh DiffChunkTools instance for each test."""
+        return DiffChunkTools()
+
+    # One test case per file, so a failure names the offending fixture and
+    # the remaining files are still exercised.
+    @pytest.mark.parametrize(
+        "filename",
+        [
+            "minimal_working.diff",  # UTF-8
+            "minimal_windows.diff",  # Windows line endings
+            "minimal_bom.diff",  # UTF-8 BOM
+            "minimal_latin1.diff",  # Latin-1
+        ],
+    )
+    def test_encoding_support(self, tools, test_data_dir, filename):
+        """Test that various encodings are supported."""
+        result = tools.load_diff(str(test_data_dir / filename))
+        assert result["chunks"] > 0, f"{filename} should parse successfully"
+        assert result["files"] > 0, f"{filename} should contain files"
+
+    def test_encoding_detection(self, tools, tmp_path):
+        """Test encoding detection with UTF-16."""
+        # Create a minimal UTF-16 diff file
+        content = """diff --git a/test.txt b/test.txt
+index 1234567..abcdefg 100644
+--- a/test.txt
++++ b/test.txt
+@@ -1 +1 @@
+-old line
++new line
+"""
+        utf16_file = tmp_path / "test_utf16.diff"
+        utf16_file.write_text(content, encoding="utf-16")
+
+        result = tools.load_diff(str(utf16_file))
+        assert result["chunks"] > 0
+        assert result["files"] > 0
+
+    def test_empty_diff_error(self, tools, tmp_path):
+        """Test error message for empty diff files."""
+        empty_file = tmp_path / "empty.diff"
+        empty_file.write_text("")
+
+        with pytest.raises(
+            ValueError, match="Diff file parsed successfully but contains no changes"
+        ):
+            tools.load_diff(str(empty_file))
diff --git a/tests/test_windows_repro.py b/tests/test_windows_repro.py
diff --git a/uv.lock b/uv.lock