Source code for lsdb.loaders.dataframe.from_dataframe

from __future__ import annotations

import pandas as pd
import pyarrow as pa

from lsdb.catalog import Catalog
from lsdb.loaders.dataframe.dataframe_catalog_loader import DataframeCatalogLoader
from lsdb.loaders.dataframe.margin_catalog_generator import MarginCatalogGenerator


# pylint: disable=too-many-arguments

[docs]
def from_dataframe(
    dataframe: pd.DataFrame,
    *,
    ra_column: str | None = None,
    dec_column: str | None = None,
    lowest_order: int = 0,
    highest_order: int = 7,
    drop_empty_siblings: bool = True,
    partition_rows: int | None = None,
    partition_bytes: int | None = None,
    margin_order: int = -1,
    margin_threshold: float | None = 5.0,
    should_generate_moc: bool = True,
    moc_max_order: int = 10,
    use_pyarrow_types: bool = True,
    schema: pa.Schema | None = None,
    **kwargs,
) -> Catalog:
    """Load a catalog from a Pandas Dataframe.

    Note that this is only suitable for small datasets (< 1million rows and
    < 1GB dataframe in-memory). If you need to deal with large datasets, consider
    using the hats-import package: https://hats-import.readthedocs.io/

    Parameters
    ----------
    dataframe : pd.Dataframe
        The catalog Pandas Dataframe.
    ra_column : str, optional
        The name of the right ascension column. By default,
        case-insensitive versions of 'ra' are detected.
    dec_column : str, optional
        The name of the declination column. By default,
        case-insensitive versions of 'dec' are detected.
    lowest_order : int, default 0
        The lowest partition order. Defaults to 0.
    highest_order : int, default 7
        The highest partition order. Defaults to 7.
    drop_empty_siblings : bool, default True
        When determining final partitionining, if 3 of 4 pixels are empty,
        keep only the non-empty pixel
    partition_rows : int or None, default None
        The desired partition size, in number of rows. Only one of
        `partition_rows` or `partition_bytes` should be specified.

        Note: partitioning is spatial (HEALPix-based). `partition_rows` is a best-effort target,
        and the resulting number of partitions is limited by `highest_order` and the sky footprint
        of your data (e.g., if all rows fall into a single HEALPix pixel at `highest_order`, you will
        still get a single partition).
    partition_bytes : int or None, default None
        The desired partition size, in bytes. Only one of
        `partition_rows` or `partition_bytes` should be specified.

        Note: as with `partition_rows`, this is a best-effort target for spatial (HEALPix-based)
        partitioning and is limited by `highest_order`.
    margin_order : int, default -1
        The order at which to generate the margin cache.
    margin_threshold : float or None, default 5
        The size of the margin cache boundary, in arcseconds. If None, and
        margin order is not specified, the margin cache is not generated. Defaults to 5 arcseconds.
    should_generate_moc : bool, default True
        Should we generate a MOC (multi-order coverage map) of the data.
        It can improve performance when joining/crossmatching to other hats-sharded datasets.
    moc_max_order : int, default 10
        if generating a MOC, what to use as the max order.
    use_pyarrow_types : bool, default True
        If True, the data is backed by pyarrow, otherwise we keep the
        original data types.
    schema : pa.Schema or None
        the arrow schema to create the catalog with. If None, the schema is
        automatically inferred from the provided DataFrame using `pa.Schema.from_pandas`.
    **kwargs :
        Arguments to pass to the creation of the catalog info.

    Returns
    -------
    Catalog
        Catalog object loaded from the given parameters

    Raises
    ------
    ValueError
        If RA/Dec columns are not found or contain NaN values.

    Examples
    --------
    Create a small, synthetic sky catalog and load it into LSDB:

    >>> import lsdb
    >>> from lsdb.nested.datasets import generate_data
    >>> nf = generate_data(1000, 5, seed=0, ra_range=(0.0, 300.0), dec_range=(-50.0, 50.0))
    >>> df = nf.compute()[["ra", "dec", "id"]]
    >>> catalog = lsdb.from_dataframe(df, catalog_name="toy_catalog")
    >>> catalog.head()  # doctest: +NORMALIZE_WHITESPACE
                            ra        dec    id
    _healpix_29
    118362963675428450  52.696686  39.675892  8154
    98504457942331510   89.913567  46.147079  3437
    70433374600953220   40.528952  35.350965  8214
    154968715224527848   17.57041    29.8936  9853
    67780378363846894    45.08384   31.95611  8297
    """
    # Load the catalog.
    catalog = DataframeCatalogLoader(
        dataframe,
        ra_column=ra_column,
        dec_column=dec_column,
        lowest_order=lowest_order,
        highest_order=highest_order,
        drop_empty_siblings=drop_empty_siblings,
        partition_rows=partition_rows,
        partition_bytes=partition_bytes,
        should_generate_moc=should_generate_moc,
        moc_max_order=moc_max_order,
        use_pyarrow_types=use_pyarrow_types,
        schema=schema,
        **kwargs,
    ).load_catalog()
    catalog.margin = MarginCatalogGenerator(
        catalog,
        margin_order,
        margin_threshold,
        use_pyarrow_types,
        **kwargs,
    ).create_catalog()
    return catalog