Artifact Archive

MinioArchive implements S3-compatible artifact and corpus support.

This API is important for:

- moving local corpus payloads into remotely readable object storage
- creating task-scoped output references
- generating signed URLs
- falling back between local file paths and remote object locators

`lingo.bucket.archive`

S3-compatible archive for storage and retrieval.

`MinioArchive`

Bases: Archive

S3-compatible implementation of the Archive interface.

Source code in lingo/bucket/archive.py

class MinioArchive(Archive):
    """S3-compatible implementation of the Archive interface.

    Files are staged under a local root directory (``<root_dir>/<bucket>``)
    and uploaded to an S3-compatible bucket through a boto3 client. Object
    keys have the shape ``[<prefix>/][<task_id>/]<name>``.
    """

    def __init__(
        self,
        bucket: str = "default",
        prefix: str = "",
        endpoint: Optional[str] = None,
        access_key: Optional[str] = None,
        secret_key: Optional[str] = None,
        secure: bool = False,
        root_dir: Optional[str] = None,
    ):
        """Create the archive, its local staging directory and its S3 client.

        Args:
            bucket: Bucket used for uploads; it is created when the initial
                ``head_bucket`` check reports that it does not exist.
            prefix: Optional key prefix placed in front of every object key.
            endpoint: Service endpoint. Falls back to ``MINIO_ENDPOINT`` and
                then to ``http://localhost:9000``.
            access_key: Falls back to ``MINIO_ACCESS_KEY``,
                ``MINIO_ROOT_USER`` and finally ``"minioadmin"``.
            secret_key: Falls back to ``MINIO_SECRET_KEY``,
                ``MINIO_ROOT_PASSWORD`` and finally ``"minioadmin"``.
            secure: Selects ``https`` instead of ``http`` when ``endpoint``
                carries no scheme of its own.
            root_dir: Base directory for locally staged files. Defaults to
                ``<system temp dir>/lingo_archive``.

        Raises:
            Exception: Whatever client creation or the bucket check raised;
                it is re-raised after a diagnostic line has been printed.
        """
        self.bucket = bucket
        self.prefix = prefix
        self.endpoint = (endpoint or os.getenv("MINIO_ENDPOINT", "http://localhost:9000")).rstrip("/")
        self.access_key = access_key or os.getenv("MINIO_ACCESS_KEY") or os.getenv("MINIO_ROOT_USER") or "minioadmin"
        self.secret_key = secret_key or os.getenv("MINIO_SECRET_KEY") or os.getenv("MINIO_ROOT_PASSWORD") or "minioadmin"
        self.secure = secure
        self._endpoint_url = self._normalize_endpoint_url(self.endpoint, secure=self.secure)
        base = Path(root_dir) if root_dir else Path(tempfile.gettempdir()) / "lingo_archive"
        # Local staging area is namespaced by bucket.
        self.root_dir = base / bucket
        self.root_dir.mkdir(parents=True, exist_ok=True)
        self._s3_client = None
        # Backward-compatible alias for tests or integrations still patching this attribute.
        self._minio_client = None

        try:
            self._s3_client = boto3.client(
                "s3",
                endpoint_url=self._endpoint_url,
                aws_access_key_id=self.access_key,
                aws_secret_access_key=self.secret_key,
            )
            self._minio_client = self._s3_client
            try:
                self._s3_client.head_bucket(Bucket=self.bucket)
            except ClientError as exc:
                error_code = exc.response.get("Error", {}).get("Code", "")
                # Only a "bucket is missing" answer triggers creation; any
                # other client error (e.g. access denied) propagates.
                if error_code in {"404", "NoSuchBucket", "NotFound"}:
                    self._s3_client.create_bucket(Bucket=self.bucket)
                else:
                    raise
        except Exception:
            print(f'S3 client failed to start with endpoint "{self._endpoint_url}".')
            raise

    @staticmethod
    def _normalize_endpoint_url(endpoint: str, *, secure: bool) -> str:
        """Return ``endpoint`` as a URL with a scheme and no trailing slash.

        An endpoint that already has a scheme keeps it; otherwise ``https``
        or ``http`` is prepended depending on ``secure``.
        """
        parsed = urlparse(endpoint)
        if parsed.scheme:
            return endpoint.rstrip("/")
        scheme = "https" if secure else "http"
        return f"{scheme}://{endpoint.strip('/')}"

    @property
    def normalized_prefix(self) -> str:
        """The configured prefix as ``"<prefix>/"``, or ``""`` when unset."""
        value = (self.prefix or "").strip("/")
        return f"{value}/" if value else ""

    @property
    def _endpoint_host(self) -> str:
        """Host (and port) of the endpoint URL, without any ``user@`` part."""
        parsed = urlparse(self._endpoint_url)
        return parsed.netloc.split("@")[-1]

    def has_upload_client(self) -> bool:
        """Return True when an S3 client is available for uploads."""
        return self._s3_client is not None

    @staticmethod
    def _extension_for_content_type(content_type: Optional[str]) -> str:
        """Guess a file extension (with leading dot) for a MIME type, or ``""``."""
        if not content_type:
            return ""
        ext = mimetypes.guess_extension(content_type, strict=False)
        return ext or ""

    def _build_object_key(
        self,
        *,
        path: Optional[str] = None,
        task_id: Optional[str] = None,
        content_type: Optional[str] = None,
    ) -> str:
        """Build an object key of the form ``[<prefix>/][<task_id>/]<name>``.

        ``<name>`` is ``path`` without leading slashes when ``path`` is given;
        otherwise a random hex UUID plus an extension guessed from
        ``content_type``.
        """
        if path:
            name = path.lstrip("/")
        else:
            name = f"{uuid.uuid4().hex}{self._extension_for_content_type(content_type)}"
        task_scope = f"{task_id}/" if task_id else ""
        # normalized_prefix is already "" when no prefix is configured, so no
        # separate prefix/no-prefix branches are needed.
        return f"{self.normalized_prefix}{task_scope}{name}"

    def _local_path_from_object_key(self, object_key: str) -> Path:
        """Map an object key to its staging path below ``root_dir``."""
        return self.root_dir / object_key.lstrip("/")

    def _resolve_path(self, relative_path: str) -> Path:
        """Map a relative path to its prefixed staging path below ``root_dir``."""
        key = self._build_object_key(path=relative_path)
        safe_key = key.lstrip("/")
        return self.root_dir / safe_key

    def _parse_minio_locator(self, locator: str) -> Optional[tuple[str, str]]:
        """Split an object locator into ``(bucket, object_key)``.

        Recognised forms are ``s3://bucket/key``, ``minio://bucket/key`` and
        ``http(s)://<endpoint host>/bucket/key``. HTTP(S) URLs pointing at a
        different host, and anything else, yield ``None``.
        """
        parsed = urlparse(locator)
        if parsed.scheme in {"s3", "minio"} and parsed.netloc and parsed.path:
            return parsed.netloc, parsed.path.lstrip("/")

        if parsed.scheme in {"http", "https"} and parsed.netloc:
            endpoint_host = self._endpoint_host
            locator_host = parsed.netloc.split("@")[-1]
            if locator_host != endpoint_host:
                return None
            path = parsed.path.lstrip("/")
            # Path-style addressing needs both a bucket and a key segment.
            if not path or "/" not in path:
                return None
            bucket, object_key = path.split("/", 1)
            return bucket, object_key

        return None

    def _public_url_for_object(self, bucket: str, object_key: str) -> str:
        """Return a URL for reading the object.

        With a client this is a presigned ``get_object`` URL valid for 24
        hours; without one it is the plain ``<endpoint>/<bucket>/<key>`` URL.
        """
        if self._s3_client is not None:
            return self._s3_client.generate_presigned_url(
                "get_object",
                Params={"Bucket": bucket, "Key": object_key},
                ExpiresIn=24 * 60 * 60,
            )
        return f"{self._endpoint_url}/{bucket}/{object_key}"

    def _object_key_from_reference(self, reference: Reference[Any]) -> str:
        """Derive the object key for a reference.

        Preference order: the reference's own ``object_key``, its path
        relative to ``root_dir``, and finally just its file name when the
        path lies outside ``root_dir``.
        """
        if reference.object_key:
            return reference.object_key.lstrip("/")
        ref_path = Path(reference.path)
        try:
            return ref_path.relative_to(self.root_dir).as_posix()
        except ValueError:
            return ref_path.name

    async def create_task_reference(
        self,
        *,
        task_id: str,
        content_type: Optional[str] = None,
        path_hint: Optional[str] = None,
    ) -> Reference[bytes]:
        """Create a reference whose object key is scoped under ``task_id``.

        ``path_hint`` becomes the object name when given; otherwise a random
        name is generated.
        """
        object_key = self._build_object_key(path=path_hint, task_id=task_id, content_type=content_type)
        local_path = self._local_path_from_object_key(object_key)
        return Reference.from_path(local_path, content_type=content_type, object_key=object_key)

    async def prepare_corpus_for_dispatch(self, corpus: Corpus[Any], *, task_id: str) -> Corpus[Any]:
        """Ensure local corpus payload is remotely readable for distributed dispatch.

        A corpus without ``source_path``, or whose path does not exist, is
        returned unchanged. Otherwise the file is placed into a task-scoped
        reference, uploaded, and a new corpus pointing at the object's URL
        is returned (``source_path`` and ``bytes_data`` cleared).
        """
        source_path = corpus.source_path
        if not source_path:
            return corpus

        local_path = Path(source_path)
        if not local_path.exists():
            return corpus

        reference = await self.create_task_reference(
            task_id=task_id,
            content_type=corpus.content_type,
            path_hint=local_path.name,
        )
        reference.dump_file(local_path, content_type=corpus.content_type)
        await self.ensure_reference_uploaded(reference)
        public_url = await self.get_public_url(reference)
        return Corpus[Any](
            content_type=corpus.content_type,
            source_url=public_url,
            object_data=corpus.object_data,
            bytes_data=None,
            source_path=None,
        )

    async def corpus_from_minio(
        self,
        *,
        object_key: str,
        bucket: Optional[str] = None,
        content_type: Optional[str] = None,
        copy_to_task: bool = False,
        task_id: Optional[str] = None,
    ) -> Corpus[bytes]:
        """Create a corpus pointing to an existing S3-compatible object.

        By default this does not copy data into `{prefix}/{task_id}`. Set
        `copy_to_task=True` to materialize a task-scoped duplicate.

        Raises:
            ValueError: ``copy_to_task`` is set but ``task_id`` is missing.
            RuntimeError: ``copy_to_task`` is set but no S3 client exists.
        """
        source_bucket = bucket or self.bucket
        source_key = object_key.lstrip("/")

        if not copy_to_task:
            return Corpus[bytes](
                content_type=content_type,
                source_url=self._public_url_for_object(source_bucket, source_key),
            )

        if not task_id:
            raise ValueError("task_id is required when copy_to_task=True")
        if self._s3_client is None:
            raise RuntimeError("copy_to_task requires an initialized S3 client")

        target_ref = await self.create_task_reference(
            task_id=task_id,
            content_type=content_type,
            path_hint=Path(source_key).name,
        )
        target_key = target_ref.object_key or self._object_key_from_reference(target_ref)

        try:
            self._s3_client.copy_object(
                Bucket=self.bucket,
                Key=target_key,
                CopySource={"Bucket": source_bucket, "Key": source_key},
            )
        except Exception:
            # Best-effort fallback: when the server-side copy fails, download
            # the source through its URL and upload it as the task copy.
            source_url = self._public_url_for_object(source_bucket, source_key)
            with urlopen(source_url, timeout=20) as response:
                data = response.read()
            target_ref.dump_content(data, content_type=content_type)
            await self.ensure_reference_uploaded(target_ref)

        return Corpus[bytes](
            content_type=content_type,
            source_url=await self.get_public_url(target_ref),
        )

    async def ensure_reference_uploaded(self, reference: Reference[Any]) -> None:
        """Upload local reference file to S3-compatible storage when configured.

        Does nothing when there is no client or the local file is missing.
        The content type defaults to ``application/octet-stream``.
        """
        if self._s3_client is None:
            return

        local_path = Path(reference.path)
        if not local_path.exists():
            return

        object_key = self._object_key_from_reference(reference)
        extra_args = {"ContentType": reference.content_type or "application/octet-stream"}
        self._s3_client.upload_file(
            str(local_path),
            self.bucket,
            object_key,
            ExtraArgs=extra_args,
        )

    async def corpus_from_url(
        self,
        url: str,
        content_type: Optional[str] = None,
    ) -> Corpus[bytes]:
        """Load a corpus from a URL.

        Resolution order: an existing local path, a ``file:`` URL or
        scheme-less path, an object locator understood by this archive, and
        finally a plain download of ``url``.
        """
        # Function-scope import so this method does not depend on a
        # module-level import that may not exist.
        from urllib.request import url2pathname

        local_candidate = Path(url)
        try:
            is_local_file = local_candidate.exists()
        except OSError:
            # A long remote URL can exceed OS path limits (ENAMETOOLONG);
            # that only means it is not a local file.
            is_local_file = False
        if is_local_file:
            return Corpus.from_file(local_candidate, content_type=content_type)

        parsed = urlparse(url)
        if parsed.scheme == "file":
            # file: URLs are percent-encoded; decode before touching the
            # filesystem so e.g. "%20" resolves to a space.
            return Corpus.from_file(Path(url2pathname(parsed.path)), content_type=content_type)
        if not parsed.scheme:
            return Corpus.from_file(local_candidate, content_type=content_type)

        locator = self._parse_minio_locator(url)
        if locator is not None:
            bucket, object_key = locator
            return await self.corpus_from_minio(
                bucket=bucket,
                object_key=object_key,
                content_type=content_type,
                copy_to_task=False,
            )

        with urlopen(url, timeout=10) as response:
            data = response.read()
        return Corpus[bytes](content_type=content_type, bytes_data=data)

    async def get_reference(
        self,
        path: Optional[str] = None,
        content_type: Optional[str] = None,
        *,
        task_id: Optional[str] = None,
    ) -> Reference[bytes]:
        """Get a reference to a storage location.

        The object key is built from the archive prefix, the optional
        ``task_id`` and ``path`` (or a generated name when ``path`` is
        omitted).
        """
        object_key = self._build_object_key(path=path, task_id=task_id, content_type=content_type)
        full_path = self._local_path_from_object_key(object_key)
        return Reference.from_path(full_path, content_type=content_type, object_key=object_key)

    async def get_public_url(self, reference: Reference[Any]) -> str:
        """Get a public URL for a reference stored in this archive's bucket."""
        object_key = self._object_key_from_reference(reference)
        return self._public_url_for_object(self.bucket, object_key)

`corpus_from_minio(*, object_key: str, bucket: Optional[str] = None, content_type: Optional[str] = None, copy_to_task: bool = False, task_id: Optional[str] = None) -> Corpus[bytes]` `async`

Create a corpus pointing to an existing S3-compatible object.

By default this does not copy data into {prefix}/{task_id}. Set copy_to_task=True to materialize a task-scoped duplicate.

Source code in lingo/bucket/archive.py

async def corpus_from_minio(
    self,
    *,
    object_key: str,
    bucket: Optional[str] = None,
    content_type: Optional[str] = None,
    copy_to_task: bool = False,
    task_id: Optional[str] = None,
) -> Corpus[bytes]:
    """Build a corpus that refers to an object already in S3-compatible storage.

    Without `copy_to_task` the corpus simply points at the source object.
    With `copy_to_task=True` the object is first duplicated under
    `{prefix}/{task_id}` and the corpus points at that duplicate.

    Raises:
        ValueError: `copy_to_task` is set but `task_id` is missing.
        RuntimeError: `copy_to_task` is set but no S3 client exists.
    """
    src_bucket = bucket if bucket else self.bucket
    src_key = object_key.lstrip("/")

    if copy_to_task:
        if not task_id:
            raise ValueError("task_id is required when copy_to_task=True")
        if self._s3_client is None:
            raise RuntimeError("copy_to_task requires an initialized S3 client")

        destination = await self.create_task_reference(
            task_id=task_id,
            content_type=content_type,
            path_hint=Path(src_key).name,
        )
        destination_key = destination.object_key
        if not destination_key:
            destination_key = self._object_key_from_reference(destination)

        try:
            self._s3_client.copy_object(
                CopySource={"Bucket": src_bucket, "Key": src_key},
                Bucket=self.bucket,
                Key=destination_key,
            )
        except Exception:
            # Server-side copy failed: fetch the bytes through the source URL
            # and push them up as the task-scoped object instead.
            fallback_url = self._public_url_for_object(src_bucket, src_key)
            with urlopen(fallback_url, timeout=20) as remote:
                payload = remote.read()
            destination.dump_content(payload, content_type=content_type)
            await self.ensure_reference_uploaded(destination)

        resolved_url = await self.get_public_url(destination)
    else:
        resolved_url = self._public_url_for_object(src_bucket, src_key)

    return Corpus[bytes](content_type=content_type, source_url=resolved_url)

`corpus_from_url(url: str, content_type: Optional[str] = None) -> Corpus[bytes]` `async`

Load a corpus from a URL.

Source code in lingo/bucket/archive.py

async def corpus_from_url(
    self,
    url: str,
    content_type: Optional[str] = None,
) -> Corpus[bytes]:
    """Load a corpus from a URL.

    Resolution order: an existing local path, a ``file:`` URL or scheme-less
    path, an object locator understood by this archive, and finally a plain
    download of ``url``.
    """
    # Function-scope import so this function does not depend on a
    # module-level import that may not exist.
    from urllib.request import url2pathname

    local_candidate = Path(url)
    try:
        is_local_file = local_candidate.exists()
    except OSError:
        # A long remote URL can exceed OS path limits (ENAMETOOLONG); that
        # only means it is not a local file.
        is_local_file = False
    if is_local_file:
        return Corpus.from_file(local_candidate, content_type=content_type)

    parsed = urlparse(url)
    if parsed.scheme == "file":
        # file: URLs are percent-encoded; decode before touching the
        # filesystem so e.g. "%20" resolves to a space.
        return Corpus.from_file(Path(url2pathname(parsed.path)), content_type=content_type)
    if not parsed.scheme:
        return Corpus.from_file(local_candidate, content_type=content_type)

    locator = self._parse_minio_locator(url)
    if locator is not None:
        bucket, object_key = locator
        return await self.corpus_from_minio(
            bucket=bucket,
            object_key=object_key,
            content_type=content_type,
            copy_to_task=False,
        )

    with urlopen(url, timeout=10) as response:
        data = response.read()
    return Corpus[bytes](content_type=content_type, bytes_data=data)

`ensure_reference_uploaded(reference: Reference[Any]) -> None` `async`

Upload local reference file to S3-compatible storage when configured.

Source code in lingo/bucket/archive.py

async def ensure_reference_uploaded(self, reference: Reference[Any]) -> None:
    """Push the reference's local file to the bucket, if that is possible.

    Nothing happens when no S3 client is configured or when the local file
    does not exist. A missing content type is sent as
    ``application/octet-stream``.
    """
    client = self._s3_client
    if client is None:
        return

    staged_file = Path(reference.path)
    if staged_file.exists():
        key = self._object_key_from_reference(reference)
        mime_type = reference.content_type or "application/octet-stream"
        client.upload_file(
            str(staged_file),
            self.bucket,
            key,
            ExtraArgs={"ContentType": mime_type},
        )

`get_public_url(reference: Reference[Any]) -> str` `async`

Get a public URL for a reference.

Source code in lingo/bucket/archive.py

async def get_public_url(self, reference: Reference[Any]) -> str:
    """Return a URL for the reference's object in this archive's bucket."""
    key = self._object_key_from_reference(reference)
    target_bucket = self.bucket
    return self._public_url_for_object(target_bucket, key)

`get_reference(path: Optional[str] = None, content_type: Optional[str] = None, *, task_id: Optional[str] = None) -> Reference[bytes]` `async`

Get a reference to a storage location.

Source code in lingo/bucket/archive.py

async def get_reference(
    self,
    path: Optional[str] = None,
    content_type: Optional[str] = None,
    *,
    task_id: Optional[str] = None,
) -> Reference[bytes]:
    """Return a reference for a storage location.

    The object key comes from ``_build_object_key`` using ``path``,
    ``task_id`` and ``content_type``; the reference's local path is derived
    from that key.
    """
    key = self._build_object_key(content_type=content_type, task_id=task_id, path=path)
    return Reference.from_path(
        self._local_path_from_object_key(key),
        object_key=key,
        content_type=content_type,
    )

`prepare_corpus_for_dispatch(corpus: Corpus[Any], *, task_id: str) -> Corpus[Any]` `async`

Ensure local corpus payload is remotely readable for distributed dispatch.

Source code in lingo/bucket/archive.py

async def prepare_corpus_for_dispatch(self, corpus: Corpus[Any], *, task_id: str) -> Corpus[Any]:
    """Make a locally stored corpus reachable by URL before it is dispatched.

    A corpus with no ``source_path``, or whose path does not exist on disk,
    is handed back untouched. Otherwise the file goes into a task-scoped
    reference, is uploaded, and a new corpus carrying the object's URL is
    returned (``source_path`` and ``bytes_data`` cleared).
    """
    origin = corpus.source_path
    payload_file = Path(origin) if origin else None
    if payload_file is None or not payload_file.exists():
        return corpus

    staged = await self.create_task_reference(
        task_id=task_id,
        content_type=corpus.content_type,
        path_hint=payload_file.name,
    )
    staged.dump_file(payload_file, content_type=corpus.content_type)
    await self.ensure_reference_uploaded(staged)
    remote_url = await self.get_public_url(staged)

    return Corpus[Any](
        content_type=corpus.content_type,
        source_url=remote_url,
        object_data=corpus.object_data,
        bytes_data=None,
        source_path=None,
    )

Artifact Archive

lingo.bucket.archive

MinioArchive

corpus_from_minio(*, object_key: str, bucket: Optional[str] = None, content_type: Optional[str] = None, copy_to_task: bool = False, task_id: Optional[str] = None) -> Corpus[bytes] async

corpus_from_url(url: str, content_type: Optional[str] = None) -> Corpus[bytes] async

ensure_reference_uploaded(reference: Reference[Any]) -> None async

get_public_url(reference: Reference[Any]) -> str async

get_reference(path: Optional[str] = None, content_type: Optional[str] = None, *, task_id: Optional[str] = None) -> Reference[bytes] async

prepare_corpus_for_dispatch(corpus: Corpus[Any], *, task_id: str) -> Corpus[Any] async

`lingo.bucket.archive`

`MinioArchive`

`corpus_from_minio(*, object_key: str, bucket: Optional[str] = None, content_type: Optional[str] = None, copy_to_task: bool = False, task_id: Optional[str] = None) -> Corpus[bytes]` `async`

`corpus_from_url(url: str, content_type: Optional[str] = None) -> Corpus[bytes]` `async`

`ensure_reference_uploaded(reference: Reference[Any]) -> None` `async`

`get_public_url(reference: Reference[Any]) -> str` `async`

`get_reference(path: Optional[str] = None, content_type: Optional[str] = None, *, task_id: Optional[str] = None) -> Reference[bytes]` `async`

`prepare_corpus_for_dispatch(corpus: Corpus[Any], *, task_id: str) -> Corpus[Any]` `async`