Skip to content

Corpus Utilities

lingo.corpus

Generic Corpus and Reference types for data payloads.

Corpus

Bases: BaseModel, Generic[T]

Represents a distinct data payload managed by the Archive.

Source code in lingo/corpus.py
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
class Corpus(BaseModel, Generic[T]):
    """Represents a distinct data payload managed by the Archive.

    The payload may be held as in-memory bytes (``bytes_data``), as a local
    file path or URL string (``source_path``), as a URL (``source_url``), or
    as an arbitrary Python object (``object_data``).
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    content_type: Optional[str] = None
    source_path: Optional[str] = None
    source_url: Optional[str] = None
    object_data: Optional[Any] = None
    bytes_data: Optional[bytes] = None

    @classmethod
    def from_file(cls, path: Union[str, Path], content_type: Optional[str] = None) -> Corpus[bytes]:
        """Load a corpus from a file.

        The file is read eagerly: its bytes are stored in ``bytes_data`` and
        the path is recorded in ``source_path``.
        """
        file_path = Path(path)
        return cls(
            content_type=content_type,
            source_path=str(file_path),
            bytes_data=file_path.read_bytes(),
        )

    @classmethod
    def from_object(cls, obj: T) -> Corpus[T]:
        """Create a corpus from a Python object.

        ``bytes`` are stored in ``bytes_data``; ``str`` is stored in
        ``object_data`` and tagged ``text/plain``; anything else is stored in
        ``object_data`` with no content type.
        """
        if isinstance(obj, bytes):
            return cls(bytes_data=obj)
        if isinstance(obj, str):
            return cls(object_data=obj, content_type="text/plain")
        return cls(object_data=obj)

    def _read_stored_bytes(self) -> Optional[bytes]:
        """Return raw bytes from ``bytes_data``, ``source_path`` or ``source_url``.

        The sources are tried in that order. ``None`` means none of them
        yielded data (nothing set, or ``source_path`` names a missing file).
        """
        if self.bytes_data is not None:
            return self.bytes_data

        if self.source_path:
            # Windows drive-letter paths look like a URI scheme (e.g. C:\\...)
            # so only treat explicit web/file schemes as remote URLs.
            if urlparse(self.source_path).scheme in {"http", "https", "file"}:
                with urlopen(self.source_path, timeout=20) as response:
                    return response.read()
            path = Path(self.source_path)
            if path.exists():
                return path.read_bytes()

        if self.source_url:
            with urlopen(self.source_url, timeout=20) as response:
                return response.read()

        return None

    def materialize_to_file(self, path: Union[str, Path]) -> Path:
        """Materialize corpus contents to a file.

        Missing parent directories are created. Returns the written path.
        """
        out_path = Path(path)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_bytes(self.materialize_content())
        return out_path

    def materialize_to_object(self) -> T:
        """Materialize corpus contents to a Python object.

        ``object_data`` is returned as-is when set. Otherwise the stored bytes
        are loaded (using the same source resolution as
        ``materialize_content``, including URL-style ``source_path`` values)
        and decoded: JSON-parsed when ``content_type`` is
        ``application/json``, else decoded as text, falling back to the raw
        bytes when they are not valid text. Returns ``None`` when no source
        yields data.
        """
        if self.object_data is not None:
            return self.object_data

        data = self._read_stored_bytes()
        if data is None:
            return None

        if self.content_type == "application/json":
            return json.loads(data.decode())

        try:
            return data.decode()
        except UnicodeDecodeError:
            # Not text: hand back the bytes untouched.
            return data

    def materialize_content(self) -> bytes:
        """Materialize corpus to raw bytes.

        Stored bytes (``bytes_data``, then ``source_path``, then
        ``source_url``) take precedence; ``object_data`` is only serialized
        when none of those yields data. An empty corpus produces ``b""``.
        """
        data = self._read_stored_bytes()
        if data is not None:
            return data

        if self.object_data is None:
            return b""

        if isinstance(self.object_data, bytes):
            return self.object_data
        if isinstance(self.object_data, str):
            return self.object_data.encode()
        # Any other object must be JSON-serializable.
        return json.dumps(self.object_data).encode()

from_file(path: Union[str, Path], content_type: Optional[str] = None) -> Corpus[bytes] classmethod

Load a corpus from a file.

Source code in lingo/corpus.py
26
27
28
29
30
31
32
33
34
@classmethod
def from_file(cls, path: Union[str, Path], content_type: Optional[str] = None) -> Corpus[bytes]:
    """Build a corpus by eagerly reading the file at ``path``.

    The file's bytes are kept in ``bytes_data`` and the path is recorded,
    as a string, in ``source_path``.
    """
    location = Path(path)
    recorded_path = str(location)
    payload = location.read_bytes()
    return cls(
        bytes_data=payload,
        source_path=recorded_path,
        content_type=content_type,
    )

from_object(obj: T) -> Corpus[T] classmethod

Create a corpus from a Python object.

Source code in lingo/corpus.py
36
37
38
39
40
41
42
43
@classmethod
def from_object(cls, obj: T) -> Corpus[T]:
    """Wrap a Python object in a corpus.

    ``bytes`` go to ``bytes_data``; ``str`` goes to ``object_data`` tagged as
    ``text/plain``; every other object goes to ``object_data`` untagged.
    """
    if isinstance(obj, bytes):
        fields = {"bytes_data": obj}
    elif isinstance(obj, str):
        fields = {"object_data": obj, "content_type": "text/plain"}
    else:
        fields = {"object_data": obj}
    return cls(**fields)

materialize_content() -> bytes

Materialize corpus to raw bytes.

Source code in lingo/corpus.py
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
def materialize_content(self) -> bytes:
    """Return the corpus payload as raw bytes.

    Lookup order: ``bytes_data``, then ``source_path`` (fetched as a URL for
    http/https/file schemes, otherwise read from disk if the file exists),
    then ``source_url``, then ``object_data`` (bytes as-is, text encoded,
    anything else JSON-serialized). An empty corpus yields ``b""``.
    """
    raw = self.bytes_data
    if raw is not None:
        return raw

    location = self.source_path
    if location:
        # A Windows drive letter (C:\\...) parses as a URI scheme, so only
        # the explicit web/file schemes are fetched as URLs.
        if urlparse(location).scheme in ("http", "https", "file"):
            with urlopen(location, timeout=20) as remote:
                return remote.read()
        local_file = Path(location)
        if local_file.exists():
            return local_file.read_bytes()

    remote_url = self.source_url
    if remote_url:
        with urlopen(remote_url, timeout=20) as remote:
            return remote.read()

    payload = self.object_data
    if payload is None:
        return b""
    if isinstance(payload, bytes):
        return payload
    if isinstance(payload, str):
        return payload.encode()
    return json.dumps(payload).encode()

materialize_to_file(path: Union[str, Path]) -> Path

Materialize corpus contents to a file.

Source code in lingo/corpus.py
45
46
47
48
49
50
def materialize_to_file(self, path: Union[str, Path]) -> Path:
    """Write the corpus bytes to ``path`` and return that path.

    Any missing parent directories are created first.
    """
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    payload = self.materialize_content()
    destination.write_bytes(payload)
    return destination

materialize_to_object() -> T

Materialize corpus contents to a Python object.

Source code in lingo/corpus.py
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def materialize_to_object(self) -> T:
    """Materialize corpus contents to a Python object.

    ``object_data`` is returned as-is when set. Otherwise raw bytes are taken
    from ``bytes_data``, then ``source_path``, then ``source_url``, and
    decoded: JSON-parsed when ``content_type`` is ``application/json``, else
    decoded as text, falling back to the raw bytes when they are not valid
    text. Returns ``None`` when no source yields data.
    """
    # Imported locally so this function does not rely on a module-level import.
    from urllib.parse import urlparse

    if self.object_data is not None:
        return self.object_data

    data = self.bytes_data
    if data is None and self.source_path:
        # source_path may hold a URL rather than a filesystem path. A Windows
        # drive letter (C:\\...) also parses as a scheme, so only explicit
        # web/file schemes are fetched; everything else is read from disk.
        if urlparse(self.source_path).scheme in {"http", "https", "file"}:
            with urlopen(self.source_path, timeout=20) as response:
                data = response.read()
        else:
            path = Path(self.source_path)
            if path.exists():
                data = path.read_bytes()
    if data is None and self.source_url:
        with urlopen(self.source_url, timeout=20) as response:
            data = response.read()
    if data is None:
        return None

    if self.content_type == "application/json":
        return json.loads(data.decode())

    try:
        return data.decode()
    except UnicodeDecodeError:
        # Not text: hand back the bytes untouched.
        return data

Reference

Bases: BaseModel, Generic[T]

A pointer to a specific location in the Archive where output should be saved.

Source code in lingo/corpus.py
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
class Reference(BaseModel, Generic[T]):
    """A pointer to a specific location in the Archive where output should be saved."""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    content_type: Optional[str] = None
    path: str
    object_key: Optional[str] = None
    public_url: Optional[str] = None

    @classmethod
    def from_path(
        cls,
        path: Union[str, Path],
        content_type: Optional[str] = None,
        *,
        object_key: Optional[str] = None,
        public_url: Optional[str] = None,
    ) -> Reference[T]:
        """Build a reference whose ``path`` is the string form of ``path``."""
        normalized = str(Path(path))
        return cls(
            content_type=content_type,
            path=normalized,
            object_key=object_key,
            public_url=public_url,
        )

    def _prepare_target(self) -> Path:
        """Return ``self.path`` as a ``Path``, creating its parent directories."""
        destination = Path(self.path)
        destination.parent.mkdir(parents=True, exist_ok=True)
        return destination

    def dump_corpus(self, corpus: Corpus[T]) -> Corpus[T]:
        """Write a corpus's bytes to this location and reload it from there.

        The reference's own ``content_type`` wins over the corpus's.
        """
        destination = self._prepare_target()
        destination.write_bytes(corpus.materialize_content())
        resolved_type = self.content_type or corpus.content_type
        return Corpus.from_file(destination, content_type=resolved_type)

    def dump_file(self, file_path: Union[str, Path], content_type: Optional[str] = None) -> Corpus[bytes]:
        """Copy the bytes of ``file_path`` to this location and load the copy.

        An explicit ``content_type`` wins over the reference's own.
        """
        origin = Path(file_path)
        destination = self._prepare_target()
        destination.write_bytes(origin.read_bytes())
        resolved_type = content_type or self.content_type
        return Corpus.from_file(destination, content_type=resolved_type)

    def dump_content(self, content: bytes, content_type: Optional[str] = None) -> Corpus[bytes]:
        """Write raw bytes to this location and load them back as a corpus.

        An explicit ``content_type`` wins over the reference's own.
        """
        destination = self._prepare_target()
        destination.write_bytes(content)
        resolved_type = content_type or self.content_type
        return Corpus.from_file(destination, content_type=resolved_type)

dump_content(content: bytes, content_type: Optional[str] = None) -> Corpus[bytes]

Save raw content to this reference location.

Source code in lingo/corpus.py
147
148
149
150
151
152
def dump_content(self, content: bytes, content_type: Optional[str] = None) -> Corpus[bytes]:
    """Write raw bytes to this reference's path and load them back as a corpus.

    Missing parent directories are created. An explicit ``content_type``
    wins over the reference's own.
    """
    destination = Path(self.path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_bytes(content)
    resolved_type = content_type or self.content_type
    return Corpus.from_file(destination, content_type=resolved_type)

dump_corpus(corpus: Corpus[T]) -> Corpus[T]

Save a corpus to this reference location.

Source code in lingo/corpus.py
132
133
134
135
136
137
def dump_corpus(self, corpus: Corpus[T]) -> Corpus[T]:
    """Write a corpus's bytes to this reference's path and reload it from there.

    Missing parent directories are created. The reference's own
    ``content_type`` wins over the corpus's.
    """
    destination = Path(self.path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    payload = corpus.materialize_content()
    destination.write_bytes(payload)
    resolved_type = self.content_type or corpus.content_type
    return Corpus.from_file(destination, content_type=resolved_type)

dump_file(file_path: Union[str, Path], content_type: Optional[str] = None) -> Corpus[bytes]

Copy a file to this reference location.

Source code in lingo/corpus.py
139
140
141
142
143
144
145
def dump_file(self, file_path: Union[str, Path], content_type: Optional[str] = None) -> Corpus[bytes]:
    """Copy the bytes of ``file_path`` to this reference's path and load the copy.

    Missing parent directories are created. An explicit ``content_type``
    wins over the reference's own.
    """
    origin = Path(file_path)
    destination = Path(self.path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    payload = origin.read_bytes()
    destination.write_bytes(payload)
    resolved_type = content_type or self.content_type
    return Corpus.from_file(destination, content_type=resolved_type)

from_path(path: Union[str, Path], content_type: Optional[str] = None, *, object_key: Optional[str] = None, public_url: Optional[str] = None) -> Reference[T] classmethod

Create a reference pointing to a file path.

Source code in lingo/corpus.py
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
@classmethod
def from_path(
    cls,
    path: Union[str, Path],
    content_type: Optional[str] = None,
    *,
    object_key: Optional[str] = None,
    public_url: Optional[str] = None,
) -> Reference[T]:
    """Build a reference whose ``path`` is the string form of ``path``.

    ``content_type``, ``object_key`` and ``public_url`` are passed through
    unchanged.
    """
    normalized = str(Path(path))
    return cls(
        content_type=content_type,
        path=normalized,
        object_key=object_key,
        public_url=public_url,
    )