Skip to main content

Documentation Index

Fetch the complete documentation index at: https://mintlify.com/terrafloww/rasteret/llms.txt

Use this file to discover all available pages before exploring further.

Overview

Rasteret makes it easy to combine multiple satellite collections into unified training pipelines. This guide shows how to:
  • Build and merge collections from different data sources
  • Harmonize band names across datasets
  • Create cross-collection training splits
  • Build hybrid datasets (imagery + embeddings, multi-modal sources)

Building Multiple Collections

Start by building collections for each data source:
import rasteret
import pyarrow as pa
import pyarrow.compute as pc

# Shared spatial and temporal extent used by every collection below.
bbox = (77.55, 13.01, 77.58, 13.08)
date_range = ("2024-01-01", "2024-06-30")


def _build_collection(source, name):
    """Build one collection over the shared bbox and date range."""
    return rasteret.build(source, name=name, bbox=bbox, date_range=date_range)


# Sentinel-2 optical imagery
s2_collection = _build_collection("earthsearch/sentinel-2-l2a", "s2_bangalore")

# Landsat for temporal diversity
landsat_collection = _build_collection(
    "earthsearch/landsat-c2-l2", "landsat_bangalore"
)

# Foundation model embeddings
aef_collection = _build_collection("aef/v1-annual", "aef_bangalore")

Merging Collections

Concatenate with PyArrow

Combine record tables and preserve all metadata columns:
# NOTE: a previous revision had `import pyarrow.concat_tables as concat_tables`,
# which raises ModuleNotFoundError — `concat_tables` is a function on the
# `pyarrow` module, not a submodule. It is reached as `pa.concat_tables` below.
import pyarrow as pa

# Extract the record tables from each collection.
s2_table = s2_collection.dataset.to_table()
landsat_table = landsat_collection.dataset.to_table()

# Align schemas by selecting the columns both sources share.
common_cols = [
    "id", "datetime", "geometry", "assets", "scene_bbox",
    "proj:epsg", "eo:cloud_cover",
]


def _tag_and_select(table, source_name):
    """Append a `source` tag column, then project to the shared schema
    plus any per-source `*_metadata` columns."""
    table = table.append_column(
        "source", pa.array([source_name] * len(table))
    )
    metadata_cols = [c for c in table.schema.names if c.endswith("_metadata")]
    return table.select(common_cols + ["source"] + metadata_cols)


# Concatenate the aligned tables into a single table.
merged_table = pa.concat_tables([
    _tag_and_select(s2_table, "sentinel-2"),
    _tag_and_select(landsat_table, "landsat"),
])

# Wrap the merged table back into a collection.
merged_collection = rasteret.as_collection(
    merged_table,
    name="multi_source",
    data_source="sentinel-2+landsat",
)

print(f"Combined collection: {merged_collection.dataset.count_rows()} rows")

Band Harmonization

When combining datasets with different band naming conventions, create a unified band mapping:
# Sentinel-2 uses B04/B08 for red/NIR while Landsat uses B4/B5.
# Harmonize at read time by subsetting the merged collection per source.

def get_multi_source_data(collection, geometries):
    """Fetch harmonized RGB+NIR from multiple sources.

    Returns a (sentinel2_data, landsat_data) pair; each holds bands in the
    order [Red, Green, Blue, NIR] regardless of the sensor's native codes.
    """
    # Per-sensor band codes for the same physical channels.
    band_codes = {
        "sentinel-2": ["B04", "B03", "B02", "B08"],
        "landsat": ["B4", "B3", "B2", "B5"],
    }

    s2_subset = collection.subset(source="sentinel-2")
    landsat_subset = collection.subset(source="landsat")

    s2_data = s2_subset.get_numpy(
        geometries=geometries,
        bands=band_codes["sentinel-2"],
    )
    landsat_data = landsat_subset.get_numpy(
        geometries=geometries,
        bands=band_codes["landsat"],
    )

    # Both are now (N, 4, H, W) with channels [Red, Green, Blue, NIR].
    return s2_data, landsat_data

Cross-Collection Splits

Assign consistent train/val/test splits across all collections:
import numpy as np

def assign_global_splits(
    collections: list[rasteret.Collection],
    train_ratio: float = 0.7,
    val_ratio: float = 0.15,
    seed: int = 42,
) -> list[rasteret.Collection]:
    """Assign deterministic train/val/test splits based on a geometry hash.

    Each row is bucketed by hashing its geometry (salted with ``seed``), so
    the same geometry always lands in the same split — including across
    different collections covering the same area.

    Args:
        collections: Collections to split; each must expose
            ``.dataset.to_table()`` and ``.name``.
        train_ratio: Fraction of rows assigned to "train".
        val_ratio: Fraction assigned to "val"; the remainder becomes "test".
        seed: Salt mixed into the hash. (Previously this parameter was
            accepted but silently ignored; it now actually varies the
            assignment.)

    Returns:
        New collections with a "split" string column appended.
    """
    import hashlib

    salt = f"{seed}:".encode("utf-8")
    split_collections = []
    for coll in collections:
        table = coll.dataset.to_table()

        # Hash each geometry for deterministic bucketing. Geometries may
        # come back as WKB bytes or WKT strings depending on the source —
        # normalize to bytes before hashing (md5 rejects str input).
        ratios = np.empty(len(table), dtype=np.float64)
        for i, geom in enumerate(table.column("geometry").to_pylist()):
            geom_bytes = (
                geom if isinstance(geom, (bytes, bytearray))
                else str(geom).encode("utf-8")
            )
            digest = int(hashlib.md5(salt + geom_bytes).hexdigest(), 16)
            ratios[i] = digest / 2**128  # md5 is 128-bit -> value in [0, 1)

        # Three-way bucket on the normalized hash values (array built once).
        splits = np.where(
            ratios < train_ratio,
            "train",
            np.where(ratios < train_ratio + val_ratio, "val", "test"),
        )

        table = table.append_column("split", pa.array(splits))
        split_collections.append(
            rasteret.as_collection(table, name=coll.name)
        )

    return split_collections

# Apply to all collections
s2_split, landsat_split, aef_split = assign_global_splits([
    s2_collection, landsat_collection, aef_collection
])
See examples/ml_training_with_splits.py for a complete end-to-end workflow using these splits.

Hybrid Datasets: Imagery + Embeddings

Combine raw imagery with foundation model embeddings:
from torch.utils.data import Dataset

class HybridDataset(Dataset):
    """Multi-modal dataset pairing Sentinel-2 RGB chips with AlphaEarth
    foundation-model embeddings over the same geometries.

    Each sample is a dict:
        "rgb":        Sentinel-2 [B04, B03, B02] chip  — (3, H, W)
        "embeddings": AlphaEarth embedding chip        — (64, H, W)
        "bbox":       bounding box of the Sentinel-2 chip
    """

    def __init__(self, s2_collection, aef_collection, geometries, chip_size=256):
        self.s2 = s2_collection.to_torchgeo_dataset(
            bands=["B04", "B03", "B02"],
            geometries=geometries,
            chip_size=chip_size,
        )
        self.aef = aef_collection.to_torchgeo_dataset(
            bands=[f"A{i:02d}" for i in range(64)],  # 64 embedding channels
            geometries=geometries,
            chip_size=chip_size,
        )
        # Fail fast on misaligned sources: __len__ reports the Sentinel-2
        # length, so a shorter embedding dataset would otherwise only
        # surface as an IndexError deep inside a training loop.
        if len(self.s2) != len(self.aef):
            raise ValueError(
                f"Source length mismatch: {len(self.s2)} Sentinel-2 chips "
                f"vs {len(self.aef)} embedding chips"
            )

    def __len__(self):
        return len(self.s2)

    def __getitem__(self, idx):
        s2_sample = self.s2[idx]
        aef_sample = self.aef[idx]

        return {
            "rgb": s2_sample["image"],          # (3, H, W)
            "embeddings": aef_sample["image"],  # (64, H, W)
            "bbox": s2_sample["bbox"],
        }

# Usage: pair the training subsets of both sources into one dataset.
train_s2 = s2_split.subset(split="train")
train_aef = aef_split.subset(split="train")
hybrid_ds = HybridDataset(train_s2, train_aef, geometries=bbox)

Best Practices

When combining datasets with different revisit times, consider:
  • Downsampling high-frequency sources to match low-frequency ones
  • Creating time windows and selecting the closest scene per window
  • Using date-based partitioning to balance temporal coverage
Different sensors have different native resolutions and grids:
  • Rasteret reprojects on-the-fly during get_numpy()/get_xarray()
  • For training, fix resolution= to a common GSD (e.g., 10m)
  • Use chip_size to ensure consistent spatial dimensions
Large multi-dataset pipelines can strain RAM:
  • Use rasteret.as_collection() with pyarrow.dataset.Dataset for lazy loading
  • Stream from disk with PyArrow partitioned datasets
  • Materialize only training split subsets during data loader construction