trajdl.datasets.modules.abstract module#

class trajdl.datasets.modules.abstract.BaseLocSeqDataModule(tokenizer: str | AbstractTokenizer, train_parquet_path: str | None = None, val_parquet_path: str | None = None, test_parquet_path: str | None = None, train_table: pa.Table | pl.DataFrame | pd.DataFrame | BaseArrowDataset | None = None, val_table: pa.Table | pl.DataFrame | pd.DataFrame | BaseArrowDataset | None = None, test_table: pa.Table | pl.DataFrame | pd.DataFrame | BaseArrowDataset | None = None, train_batch_size: int = 2, val_batch_size: int = 2, train_sampler: Sampler | None = None, val_sampler: Sampler | None = None, num_cpus: int = 0)[source]#

Bases: BaseSeqDataModule

Abstract class for location sequence data modules.

init_from_parquet(path: str) -> LocSeqDataset[source]#

Load a location sequence dataset from a parquet file.

Parameters:

path (str) – Path to the parquet file.

Returns:

The loaded location sequence dataset.

Return type:

LocSeqDataset

init_from_table(table: pa.Table | pl.DataFrame | pd.DataFrame | BaseArrowDataset) -> LocSeqDataset[source]#

Load a location sequence dataset from a pyarrow table, a pandas/polars DataFrame or a BaseArrowDataset.

Parameters:

table (Union[pa.Table, pl.DataFrame, pd.DataFrame, BaseArrowDataset]) – The input table or DataFrame.

Returns:

The loaded location sequence dataset.

Return type:

LocSeqDataset

class trajdl.datasets.modules.abstract.BaseSeqDataModule(tokenizer: str | AbstractTokenizer, train_parquet_path: str | None = None, val_parquet_path: str | None = None, test_parquet_path: str | None = None, train_table: pa.Table | pl.DataFrame | pd.DataFrame | BaseArrowDataset | None = None, val_table: pa.Table | pl.DataFrame | pd.DataFrame | BaseArrowDataset | None = None, test_table: pa.Table | pl.DataFrame | pd.DataFrame | BaseArrowDataset | None = None, train_batch_size: int = 2, val_batch_size: int = 2, train_sampler: Sampler | None = None, val_sampler: Sampler | None = None, num_cpus: int = 0)[source]#

Bases: LightningDataModule, ABC

Abstract class for sequence data modules.

Parameters:
  • tokenizer (Union[str, AbstractTokenizer]) – Path to the tokenizer, or a tokenizer instance.

  • train_parquet_path (str, optional) – Path to the training parquet file.

  • val_parquet_path (str, optional) – Path to the validation parquet file.

  • test_parquet_path (str, optional) – Path to the test parquet file.

  • train_table (Union[pa.Table, pl.DataFrame, pd.DataFrame, BaseArrowDataset, None], optional) – Training data. A DataFrame will be transformed into a pyarrow.Table automatically.

  • val_table (Union[pa.Table, pl.DataFrame, pd.DataFrame, BaseArrowDataset, None], optional) – Validation data. A DataFrame will be transformed into a pyarrow.Table automatically.

  • test_table (Union[pa.Table, pl.DataFrame, pd.DataFrame, BaseArrowDataset, None], optional) – Test data. A DataFrame will be transformed into a pyarrow.Table automatically.

  • train_batch_size (int, optional) – Batch size for training.

  • val_batch_size (int, optional) – Batch size for validation.

  • train_sampler (Union[Sampler, None], optional) – Custom sampler for training.

  • val_sampler (Union[Sampler, None], optional) – Custom sampler for validation.

  • num_cpus (int, optional) – Number of CPUs to use.

abstract collate_function(ds: BaseArrowDataset)[source]#

Collate function for the dataset. Different modules may require different implementations.

Parameters:

ds (BaseArrowDataset) – The dataset to collate.

Returns:

The collated data.

Return type:

Any

abstract init_from_parquet(path: str) -> BaseArrowDataset[source]#

Load an Arrow dataset from a parquet file.

Parameters:

path (str) – Path to the parquet file.

Returns:

The loaded Arrow dataset.

Return type:

BaseArrowDataset

abstract init_from_table(table: pa.Table | pl.DataFrame | pd.DataFrame | BaseArrowDataset) -> BaseArrowDataset[source]#

Load a dataset from a pyarrow table, a pandas/polars DataFrame or a BaseArrowDataset.

Parameters:

table (Union[pa.Table, pl.DataFrame, pd.DataFrame, BaseArrowDataset]) – The input table or DataFrame.

Returns:

The loaded Arrow dataset.

Return type:

BaseArrowDataset

num_cpus: int = 0#
setup(stage: str)[source]#

Set up the data module, loading the tokenizer and initializing datasets.

Parameters:

stage (str) – Stage of operation (e.g. 'fit', 'test').

test_dataloader()[source]#

Create the test data loader.

Returns:

The data loader for the test dataset.

Return type:

DataLoader

test_parquet_path: str | None = None#
test_table: pa.Table | pl.DataFrame | pd.DataFrame | BaseArrowDataset | None = None#
tokenizer: str | AbstractTokenizer#
train_batch_size: int = 2#
train_dataloader()[source]#

Create the training data loader.

Returns:

The data loader for the training dataset.

Return type:

DataLoader

train_parquet_path: str | None = None#
train_sampler: Sampler | None = None#
train_table: pa.Table | pl.DataFrame | pd.DataFrame | BaseArrowDataset | None = None#
val_batch_size: int = 2#
val_dataloader()[source]#

Create the validation data loader.

Returns:

The data loader for the validation dataset.

Return type:

DataLoader

val_parquet_path: str | None = None#
val_sampler: Sampler | None = None#
val_table: pa.Table | pl.DataFrame | pd.DataFrame | BaseArrowDataset | None = None#
class trajdl.datasets.modules.abstract.BaseTrajectoryDataModule(tokenizer: str | AbstractTokenizer, train_parquet_path: str | None = None, val_parquet_path: str | None = None, test_parquet_path: str | None = None, train_table: pa.Table | pl.DataFrame | pd.DataFrame | BaseArrowDataset | None = None, val_table: pa.Table | pl.DataFrame | pd.DataFrame | BaseArrowDataset | None = None, test_table: pa.Table | pl.DataFrame | pd.DataFrame | BaseArrowDataset | None = None, train_batch_size: int = 2, val_batch_size: int = 2, train_sampler: Sampler | None = None, val_sampler: Sampler | None = None, num_cpus: int = 0)[source]#

Bases: BaseSeqDataModule

Abstract class for trajectory sequence data modules.

init_from_parquet(path: str) -> TrajectoryDataset[source]#

Load a trajectory dataset from a parquet file.

Parameters:

path (str) – Path to the parquet file.

Returns:

The loaded trajectory dataset.

Return type:

TrajectoryDataset

init_from_table(table: pa.Table | pl.DataFrame | pd.DataFrame | BaseArrowDataset) -> TrajectoryDataset[source]#

Load a trajectory dataset from a pyarrow table, a pandas/polars DataFrame or a BaseArrowDataset.

Parameters:

table (Union[pa.Table, pl.DataFrame, pd.DataFrame, BaseArrowDataset]) – The input table or DataFrame.

Returns:

The loaded trajectory dataset.

Return type:

TrajectoryDataset