mirror of
https://github.com/bytedance/deer-flow.git
synced 2026-06-11 18:05:58 +00:00
Use sha256 user buckets with legacy migration
This commit is contained in:
@@ -483,6 +483,16 @@ def _owner_headers(msg: InboundMessage) -> dict[str, str] | None:
|
|||||||
return create_internal_auth_headers(owner_user_id=owner_user_id)
|
return create_internal_auth_headers(owner_user_id=owner_user_id)
|
||||||
|
|
||||||
|
|
||||||
|
def _safe_user_id_for_run(raw_user_id: str) -> str:
|
||||||
|
from deerflow.config.paths import get_paths
|
||||||
|
|
||||||
|
try:
|
||||||
|
return get_paths().prepare_user_dir_for_raw_id(raw_user_id)
|
||||||
|
except Exception:
|
||||||
|
logger.exception("Failed to prepare channel run user directory")
|
||||||
|
return make_safe_user_id(raw_user_id)
|
||||||
|
|
||||||
|
|
||||||
def _resolve_slash_skill_command(
|
def _resolve_slash_skill_command(
|
||||||
text: str,
|
text: str,
|
||||||
available_skills: set[str] | None = None,
|
available_skills: set[str] | None = None,
|
||||||
@@ -780,9 +790,9 @@ class ChannelManager:
|
|||||||
run_context_identity: dict[str, Any] = {"thread_id": thread_id}
|
run_context_identity: dict[str, Any] = {"thread_id": thread_id}
|
||||||
owner_user_id = _effective_owner_user_id(msg)
|
owner_user_id = _effective_owner_user_id(msg)
|
||||||
if owner_user_id:
|
if owner_user_id:
|
||||||
run_context_identity["user_id"] = make_safe_user_id(owner_user_id)
|
run_context_identity["user_id"] = _safe_user_id_for_run(owner_user_id)
|
||||||
elif msg.user_id:
|
elif msg.user_id:
|
||||||
run_context_identity["user_id"] = make_safe_user_id(msg.user_id)
|
run_context_identity["user_id"] = _safe_user_id_for_run(msg.user_id)
|
||||||
if msg.user_id:
|
if msg.user_id:
|
||||||
run_context_identity["channel_user_id"] = msg.user_id
|
run_context_identity["channel_user_id"] = msg.user_id
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
import hashlib
|
import hashlib
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
@@ -13,6 +14,9 @@ _SAFE_THREAD_ID_RE = re.compile(r"^[A-Za-z0-9_\-]+$")
|
|||||||
_SAFE_USER_ID_RE = re.compile(r"^[A-Za-z0-9_\-]+$")
|
_SAFE_USER_ID_RE = re.compile(r"^[A-Za-z0-9_\-]+$")
|
||||||
_UNSAFE_USER_ID_CHAR_RE = re.compile(r"[^A-Za-z0-9_\-]")
|
_UNSAFE_USER_ID_CHAR_RE = re.compile(r"[^A-Za-z0-9_\-]")
|
||||||
_SAFE_USER_ID_DIGEST_HEX_LEN = 16
|
_SAFE_USER_ID_DIGEST_HEX_LEN = 16
|
||||||
|
_SAFE_USER_ID_DIGEST_RE = re.compile(r"^[0-9a-f]{16}$")
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def _default_local_base_dir() -> Path:
|
def _default_local_base_dir() -> Path:
|
||||||
@@ -47,10 +51,17 @@ def make_safe_user_id(raw: str) -> str:
|
|||||||
sanitized = _UNSAFE_USER_ID_CHAR_RE.sub("-", raw)
|
sanitized = _UNSAFE_USER_ID_CHAR_RE.sub("-", raw)
|
||||||
if sanitized == raw:
|
if sanitized == raw:
|
||||||
return raw
|
return raw
|
||||||
digest = hashlib.sha1(raw.encode("utf-8")).hexdigest()[:_SAFE_USER_ID_DIGEST_HEX_LEN]
|
digest = hashlib.sha256(raw.encode("utf-8")).hexdigest()[:_SAFE_USER_ID_DIGEST_HEX_LEN]
|
||||||
return f"{sanitized}-{digest}"
|
return f"{sanitized}-{digest}"
|
||||||
|
|
||||||
|
|
||||||
|
def _looks_like_digested_user_dir(name: str, prefix: str) -> bool:
|
||||||
|
if not name.startswith(prefix):
|
||||||
|
return False
|
||||||
|
suffix = name[len(prefix) :]
|
||||||
|
return bool(_SAFE_USER_ID_DIGEST_RE.match(suffix))
|
||||||
|
|
||||||
|
|
||||||
def _join_host_path(base: str, *parts: str) -> str:
|
def _join_host_path(base: str, *parts: str) -> str:
|
||||||
"""Join host filesystem path segments while preserving native style.
|
"""Join host filesystem path segments while preserving native style.
|
||||||
|
|
||||||
@@ -172,6 +183,45 @@ class Paths:
|
|||||||
"""Directory for a specific user: `{base_dir}/users/{user_id}/`."""
|
"""Directory for a specific user: `{base_dir}/users/{user_id}/`."""
|
||||||
return self.base_dir / "users" / _validate_user_id(user_id)
|
return self.base_dir / "users" / _validate_user_id(user_id)
|
||||||
|
|
||||||
|
def prepare_user_dir_for_raw_id(self, raw_user_id: str) -> str:
|
||||||
|
"""Return the safe user ID and migrate a unique legacy unsafe-id bucket.
|
||||||
|
|
||||||
|
A previous branch revision used a weak digest for unsafe external user IDs.
|
||||||
|
New IDs use SHA-256, but if exactly one old-style bucket with the same
|
||||||
|
sanitized prefix already exists, move it to the current bucket name so
|
||||||
|
existing local memory/files/threads remain visible.
|
||||||
|
"""
|
||||||
|
safe_user_id = make_safe_user_id(raw_user_id)
|
||||||
|
sanitized = _UNSAFE_USER_ID_CHAR_RE.sub("-", raw_user_id)
|
||||||
|
if safe_user_id == raw_user_id:
|
||||||
|
return safe_user_id
|
||||||
|
|
||||||
|
users_dir = self.base_dir / "users"
|
||||||
|
target_dir = users_dir / safe_user_id
|
||||||
|
if target_dir.exists() or not users_dir.exists():
|
||||||
|
return safe_user_id
|
||||||
|
|
||||||
|
legacy_prefix = f"{sanitized}-"
|
||||||
|
try:
|
||||||
|
legacy_candidates = [candidate for candidate in users_dir.iterdir() if candidate.is_dir() and candidate.name != safe_user_id and _looks_like_digested_user_dir(candidate.name, legacy_prefix)]
|
||||||
|
except OSError:
|
||||||
|
logger.exception("Failed to inspect user directories for legacy unsafe-id migration")
|
||||||
|
return safe_user_id
|
||||||
|
|
||||||
|
if not legacy_candidates:
|
||||||
|
return safe_user_id
|
||||||
|
|
||||||
|
if len(legacy_candidates) > 1:
|
||||||
|
logger.warning("Multiple legacy unsafe-id user directories matched; skipping automatic migration")
|
||||||
|
return safe_user_id
|
||||||
|
|
||||||
|
try:
|
||||||
|
legacy_candidates[0].rename(target_dir)
|
||||||
|
logger.info("Migrated legacy unsafe-id user directory to the current digest format")
|
||||||
|
except OSError:
|
||||||
|
logger.exception("Failed to migrate legacy unsafe-id user directory")
|
||||||
|
return safe_user_id
|
||||||
|
|
||||||
def user_memory_file(self, user_id: str) -> Path:
|
def user_memory_file(self, user_id: str) -> Path:
|
||||||
"""Per-user memory file: `{base_dir}/users/{user_id}/memory.json`."""
|
"""Per-user memory file: `{base_dir}/users/{user_id}/memory.json`."""
|
||||||
return self.user_dir(user_id) / "memory.json"
|
return self.user_dir(user_id) / "memory.json"
|
||||||
|
|||||||
@@ -2461,6 +2461,28 @@ class TestResolveRunParamsUserId:
|
|||||||
assert run_context["user_id"] != raw
|
assert run_context["user_id"] != raw
|
||||||
assert run_context["channel_user_id"] == raw
|
assert run_context["channel_user_id"] == raw
|
||||||
|
|
||||||
|
def test_unsafe_user_id_migrates_unique_legacy_bucket(self, tmp_path, monkeypatch):
|
||||||
|
from deerflow.config.paths import Paths, make_safe_user_id
|
||||||
|
|
||||||
|
paths = Paths(tmp_path)
|
||||||
|
legacy_dir = paths.base_dir / "users" / "user-example-com-63a710569261a24b"
|
||||||
|
legacy_dir.mkdir(parents=True)
|
||||||
|
(legacy_dir / "memory.json").write_text('{"legacy": true}\n', encoding="utf-8")
|
||||||
|
monkeypatch.setattr("deerflow.config.paths.get_paths", lambda: paths)
|
||||||
|
|
||||||
|
manager = self._manager()
|
||||||
|
monkeypatch.delenv("DEER_FLOW_AUTH_DISABLED", raising=False)
|
||||||
|
raw = "user@example.com"
|
||||||
|
msg = InboundMessage(channel_name="feishu", chat_id="c", user_id=raw, text="hi")
|
||||||
|
|
||||||
|
_, _, run_context = manager._resolve_run_params(msg, "thread-1")
|
||||||
|
|
||||||
|
safe = make_safe_user_id(raw)
|
||||||
|
assert run_context["user_id"] == safe
|
||||||
|
assert paths.user_dir(safe).exists()
|
||||||
|
assert not legacy_dir.exists()
|
||||||
|
assert (paths.user_dir(safe) / "memory.json").read_text(encoding="utf-8") == '{"legacy": true}\n'
|
||||||
|
|
||||||
@pytest.mark.parametrize("raw_user_id", ["", None])
|
@pytest.mark.parametrize("raw_user_id", ["", None])
|
||||||
def test_empty_or_none_user_id_is_not_injected(self, raw_user_id, monkeypatch):
|
def test_empty_or_none_user_id_is_not_injected(self, raw_user_id, monkeypatch):
|
||||||
manager = self._manager()
|
manager = self._manager()
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
"""Tests for user-scoped path resolution in Paths."""
|
"""Tests for user-scoped path resolution in Paths."""
|
||||||
|
|
||||||
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
@@ -44,7 +45,7 @@ class TestMakeSafeUserId:
|
|||||||
# Sanitized prefix plus a stable digest of the original.
|
# Sanitized prefix plus a stable digest of the original.
|
||||||
assert result.startswith("user-example-com-")
|
assert result.startswith("user-example-com-")
|
||||||
assert len(result.rsplit("-", 1)[1]) == 16
|
assert len(result.rsplit("-", 1)[1]) == 16
|
||||||
assert result == "user-example-com-63a710569261a24b"
|
assert result == "user-example-com-b4c9a289323b21a0"
|
||||||
assert make_safe_user_id("user@example.com") == result
|
assert make_safe_user_id("user@example.com") == result
|
||||||
|
|
||||||
def test_sanitized_id_passes_validation(self, paths: Paths):
|
def test_sanitized_id_passes_validation(self, paths: Paths):
|
||||||
@@ -70,6 +71,37 @@ class TestUserDir:
|
|||||||
def test_user_dir(self, paths: Paths):
|
def test_user_dir(self, paths: Paths):
|
||||||
assert paths.user_dir("alice") == paths.base_dir / "users" / "alice"
|
assert paths.user_dir("alice") == paths.base_dir / "users" / "alice"
|
||||||
|
|
||||||
|
def test_prepare_user_dir_migrates_unique_legacy_unsafe_bucket(self, paths: Paths):
|
||||||
|
from deerflow.config.paths import make_safe_user_id
|
||||||
|
|
||||||
|
raw = "user@example.com"
|
||||||
|
safe = make_safe_user_id(raw)
|
||||||
|
legacy_dir = paths.base_dir / "users" / "user-example-com-63a710569261a24b"
|
||||||
|
legacy_dir.mkdir(parents=True)
|
||||||
|
(legacy_dir / "memory.json").write_text('{"legacy": true}\n', encoding="utf-8")
|
||||||
|
|
||||||
|
assert paths.prepare_user_dir_for_raw_id(raw) == safe
|
||||||
|
|
||||||
|
current_dir = paths.user_dir(safe)
|
||||||
|
assert current_dir.exists()
|
||||||
|
assert not legacy_dir.exists()
|
||||||
|
assert (current_dir / "memory.json").read_text(encoding="utf-8") == '{"legacy": true}\n'
|
||||||
|
|
||||||
|
def test_prepare_user_dir_skips_ambiguous_legacy_unsafe_buckets(self, paths: Paths, caplog):
|
||||||
|
from deerflow.config.paths import make_safe_user_id
|
||||||
|
|
||||||
|
users_dir = paths.base_dir / "users"
|
||||||
|
(users_dir / "a-b-1111111111111111").mkdir(parents=True)
|
||||||
|
(users_dir / "a-b-2222222222222222").mkdir(parents=True)
|
||||||
|
|
||||||
|
caplog.set_level(logging.WARNING)
|
||||||
|
assert paths.prepare_user_dir_for_raw_id("a.b") == make_safe_user_id("a.b")
|
||||||
|
|
||||||
|
assert not paths.user_dir(make_safe_user_id("a.b")).exists()
|
||||||
|
assert (users_dir / "a-b-1111111111111111").exists()
|
||||||
|
assert (users_dir / "a-b-2222222222222222").exists()
|
||||||
|
assert any("Multiple legacy unsafe-id user directories matched" in r.message for r in caplog.records)
|
||||||
|
|
||||||
|
|
||||||
class TestUserMemoryFile:
|
class TestUserMemoryFile:
|
||||||
def test_user_memory_file(self, paths: Paths):
|
def test_user_memory_file(self, paths: Paths):
|
||||||
|
|||||||
Reference in New Issue
Block a user