From 0268ebbe6a375ef0b3bbf6bc769588525a6ddaf4 Mon Sep 17 00:00:00 2001 From: taohe Date: Thu, 11 Jun 2026 21:08:08 +0800 Subject: [PATCH] Use sha256 user buckets with legacy migration --- backend/app/channels/manager.py | 14 ++++- .../packages/harness/deerflow/config/paths.py | 52 ++++++++++++++++++- backend/tests/test_channels.py | 22 ++++++++ backend/tests/test_paths_user_isolation.py | 34 +++++++++++- 4 files changed, 118 insertions(+), 4 deletions(-) diff --git a/backend/app/channels/manager.py b/backend/app/channels/manager.py index 1b2a6bbc8..3c8998c0f 100644 --- a/backend/app/channels/manager.py +++ b/backend/app/channels/manager.py @@ -483,6 +483,16 @@ def _owner_headers(msg: InboundMessage) -> dict[str, str] | None: return create_internal_auth_headers(owner_user_id=owner_user_id) +def _safe_user_id_for_run(raw_user_id: str) -> str: + from deerflow.config.paths import get_paths + + try: + return get_paths().prepare_user_dir_for_raw_id(raw_user_id) + except Exception: + logger.exception("Failed to prepare channel run user directory") + return make_safe_user_id(raw_user_id) + + def _resolve_slash_skill_command( text: str, available_skills: set[str] | None = None, @@ -780,9 +790,9 @@ class ChannelManager: run_context_identity: dict[str, Any] = {"thread_id": thread_id} owner_user_id = _effective_owner_user_id(msg) if owner_user_id: - run_context_identity["user_id"] = make_safe_user_id(owner_user_id) + run_context_identity["user_id"] = _safe_user_id_for_run(owner_user_id) elif msg.user_id: - run_context_identity["user_id"] = make_safe_user_id(msg.user_id) + run_context_identity["user_id"] = _safe_user_id_for_run(msg.user_id) if msg.user_id: run_context_identity["channel_user_id"] = msg.user_id diff --git a/backend/packages/harness/deerflow/config/paths.py b/backend/packages/harness/deerflow/config/paths.py index f01959657..2d53b1b02 100644 --- a/backend/packages/harness/deerflow/config/paths.py +++ b/backend/packages/harness/deerflow/config/paths.py @@ -1,4 +1,5 @@ import hashlib +import logging import os import re import shutil @@ -13,6 +14,9 @@ _SAFE_THREAD_ID_RE = re.compile(r"^[A-Za-z0-9_\-]+$") _SAFE_USER_ID_RE = re.compile(r"^[A-Za-z0-9_\-]+$") _UNSAFE_USER_ID_CHAR_RE = re.compile(r"[^A-Za-z0-9_\-]") _SAFE_USER_ID_DIGEST_HEX_LEN = 16 +_SAFE_USER_ID_DIGEST_RE = re.compile(r"^[0-9a-f]{16}$") + +logger = logging.getLogger(__name__) def _default_local_base_dir() -> Path: @@ -47,10 +51,17 @@ def make_safe_user_id(raw: str) -> str: sanitized = _UNSAFE_USER_ID_CHAR_RE.sub("-", raw) if sanitized == raw: return raw - digest = hashlib.sha1(raw.encode("utf-8")).hexdigest()[:_SAFE_USER_ID_DIGEST_HEX_LEN] + digest = hashlib.sha256(raw.encode("utf-8")).hexdigest()[:_SAFE_USER_ID_DIGEST_HEX_LEN] return f"{sanitized}-{digest}" +def _looks_like_digested_user_dir(name: str, prefix: str) -> bool: + if not name.startswith(prefix): + return False + suffix = name[len(prefix) :] + return bool(_SAFE_USER_ID_DIGEST_RE.match(suffix)) + + def _join_host_path(base: str, *parts: str) -> str: """Join host filesystem path segments while preserving native style. @@ -172,6 +183,45 @@ class Paths: """Directory for a specific user: `{base_dir}/users/{user_id}/`.""" return self.base_dir / "users" / _validate_user_id(user_id) + def prepare_user_dir_for_raw_id(self, raw_user_id: str) -> str: + """Return the safe user ID and migrate a unique legacy unsafe-id bucket. + + A previous branch revision used a weak digest for unsafe external user IDs. + New IDs use SHA-256, but if exactly one old-style bucket with the same + sanitized prefix already exists, move it to the current bucket name so + existing local memory/files/threads remain visible. + """ + safe_user_id = make_safe_user_id(raw_user_id) + sanitized = _UNSAFE_USER_ID_CHAR_RE.sub("-", raw_user_id) + if safe_user_id == raw_user_id: + return safe_user_id + + users_dir = self.base_dir / "users" + target_dir = users_dir / safe_user_id + if target_dir.exists() or not users_dir.exists(): + return safe_user_id + + legacy_prefix = f"{sanitized}-" + try: + legacy_candidates = [candidate for candidate in users_dir.iterdir() if candidate.is_dir() and candidate.name != safe_user_id and _looks_like_digested_user_dir(candidate.name, legacy_prefix)] + except OSError: + logger.exception("Failed to inspect user directories for legacy unsafe-id migration") + return safe_user_id + + if not legacy_candidates: + return safe_user_id + + if len(legacy_candidates) > 1: + logger.warning("Multiple legacy unsafe-id user directories matched; skipping automatic migration") + return safe_user_id + + try: + legacy_candidates[0].rename(target_dir) + logger.info("Migrated legacy unsafe-id user directory to the current digest format") + except OSError: + logger.exception("Failed to migrate legacy unsafe-id user directory") + return safe_user_id + def user_memory_file(self, user_id: str) -> Path: """Per-user memory file: `{base_dir}/users/{user_id}/memory.json`.""" return self.user_dir(user_id) / "memory.json" diff --git a/backend/tests/test_channels.py b/backend/tests/test_channels.py index 9309b00d7..17476649b 100644 --- a/backend/tests/test_channels.py +++ b/backend/tests/test_channels.py @@ -2461,6 +2461,28 @@ class TestResolveRunParamsUserId: assert run_context["user_id"] != raw assert run_context["channel_user_id"] == raw + def test_unsafe_user_id_migrates_unique_legacy_bucket(self, tmp_path, monkeypatch): + from deerflow.config.paths import Paths, make_safe_user_id + + paths = Paths(tmp_path) + legacy_dir = paths.base_dir / "users" / "user-example-com-63a710569261a24b" + legacy_dir.mkdir(parents=True) + (legacy_dir / "memory.json").write_text('{"legacy": true}\n', encoding="utf-8") + monkeypatch.setattr("deerflow.config.paths.get_paths", lambda: paths) + + manager = self._manager() + monkeypatch.delenv("DEER_FLOW_AUTH_DISABLED", raising=False) + raw = "user@example.com" + msg = InboundMessage(channel_name="feishu", chat_id="c", user_id=raw, text="hi") + + _, _, run_context = manager._resolve_run_params(msg, "thread-1") + + safe = make_safe_user_id(raw) + assert run_context["user_id"] == safe + assert paths.user_dir(safe).exists() + assert not legacy_dir.exists() + assert (paths.user_dir(safe) / "memory.json").read_text(encoding="utf-8") == '{"legacy": true}\n' + @pytest.mark.parametrize("raw_user_id", ["", None]) def test_empty_or_none_user_id_is_not_injected(self, raw_user_id, monkeypatch): manager = self._manager() diff --git a/backend/tests/test_paths_user_isolation.py b/backend/tests/test_paths_user_isolation.py index f2fed1bed..8bd9a5f52 100644 --- a/backend/tests/test_paths_user_isolation.py +++ b/backend/tests/test_paths_user_isolation.py @@ -1,5 +1,6 @@ """Tests for user-scoped path resolution in Paths.""" +import logging from pathlib import Path import pytest @@ -44,7 +45,7 @@ class TestMakeSafeUserId: # Sanitized prefix plus a stable digest of the original. assert result.startswith("user-example-com-") assert len(result.rsplit("-", 1)[1]) == 16 - assert result == "user-example-com-63a710569261a24b" + assert result == "user-example-com-b4c9a289323b21a0" assert make_safe_user_id("user@example.com") == result def test_sanitized_id_passes_validation(self, paths: Paths): @@ -70,6 +71,37 @@ class TestUserDir: def test_user_dir(self, paths: Paths): assert paths.user_dir("alice") == paths.base_dir / "users" / "alice" + def test_prepare_user_dir_migrates_unique_legacy_unsafe_bucket(self, paths: Paths): + from deerflow.config.paths import make_safe_user_id + + raw = "user@example.com" + safe = make_safe_user_id(raw) + legacy_dir = paths.base_dir / "users" / "user-example-com-63a710569261a24b" + legacy_dir.mkdir(parents=True) + (legacy_dir / "memory.json").write_text('{"legacy": true}\n', encoding="utf-8") + + assert paths.prepare_user_dir_for_raw_id(raw) == safe + + current_dir = paths.user_dir(safe) + assert current_dir.exists() + assert not legacy_dir.exists() + assert (current_dir / "memory.json").read_text(encoding="utf-8") == '{"legacy": true}\n' + + def test_prepare_user_dir_skips_ambiguous_legacy_unsafe_buckets(self, paths: Paths, caplog): + from deerflow.config.paths import make_safe_user_id + + users_dir = paths.base_dir / "users" + (users_dir / "a-b-1111111111111111").mkdir(parents=True) + (users_dir / "a-b-2222222222222222").mkdir(parents=True) + + caplog.set_level(logging.WARNING) + assert paths.prepare_user_dir_for_raw_id("a.b") == make_safe_user_id("a.b") + + assert not paths.user_dir(make_safe_user_id("a.b")).exists() + assert (users_dir / "a-b-1111111111111111").exists() + assert (users_dir / "a-b-2222222222222222").exists() + assert any("Multiple legacy unsafe-id user directories matched" in r.message for r in caplog.records) + class TestUserMemoryFile: def test_user_memory_file(self, paths: Paths):