import numpy as np
import pyarrow as pa
from nested_pandas.nestedframe.io import from_pyarrow
from lsdb.loaders.dataframe.from_dataframe import from_dataframe
# pylint: disable=too-many-arguments
def from_astropy(
    table,
    *,
    ra_column: str | None = None,
    dec_column: str | None = None,
    lowest_order: int = 0,
    highest_order: int = 7,
    drop_empty_siblings: bool = True,
    partition_rows: int | None = None,
    partition_bytes: int | None = None,
    margin_order: int = -1,
    margin_threshold: float | None = 5.0,
    should_generate_moc: bool = True,
    moc_max_order: int = 10,
    use_pyarrow_types: bool = True,
    schema=None,
    flatten_tensors: bool = False,
    **kwargs,
):
    """Load a catalog from an Astropy Table.

    Note that this is only suitable for small datasets (< 1million rows and
    < 1GB dataframe in-memory). If you need to deal with large datasets, consider
    using the hats-import package: https://hats-import.readthedocs.io/

    Parameters
    ----------
    table : astropy.table.Table
        The Astropy Table (or QTable).
    ra_column : str, optional
        The name of the right ascension column. By default,
        case-insensitive versions of 'ra' are detected.
    dec_column : str, optional
        The name of the declination column. By default,
        case-insensitive versions of 'dec' are detected.
    lowest_order : int, default 0
        The lowest partition order. Defaults to 0.
    highest_order : int, default 7
        The highest partition order. Defaults to 7.
    drop_empty_siblings : bool, default True
        When determining final partitioning, if 3 of 4 pixels are empty,
        keep only the non-empty pixel
    partition_rows : int or None, default None
        The desired partition size, in number of rows. Only one of
        `partition_rows` or `partition_bytes` should be specified.
        Note: partitioning is spatial (HEALPix-based). `partition_rows` is a best-effort target,
        and the resulting number of partitions is limited by `highest_order` and the sky footprint
        of your data.
    partition_bytes : int or None, default None
        The desired partition size, in bytes. Only one of
        `partition_rows` or `partition_bytes` should be specified.
        Note: as with `partition_rows`, this is a best-effort target for spatial (HEALPix-based)
        partitioning and is limited by `highest_order`.
    margin_order : int, default -1
        The order at which to generate the margin cache.
    margin_threshold : float or None, default 5
        The threshold (in arcseconds) for including sources in the margin cache. If None, and
        margin_order is specified, the margin cache will include all sources in the margin pixels.
    should_generate_moc : bool, default True
        If True, generates a MOC for the catalog.
    moc_max_order : int, default 10
        The maximum order to use when generating the MOC.
    use_pyarrow_types : bool, default True
        If True, uses PyArrow backed types in the resulting catalog.
    schema : pa.Schema or None, default None
        The arrow schema to create the catalog with. If None, the schema is
        automatically inferred from the DataFrame conversion of the table
        using `pa.Schema.from_pandas`.
    flatten_tensors : bool, default False
        If True, flattens multidimensional columns to 2D arrays in the
        resulting catalog.
    **kwargs
        Additional arguments to pass along to LSDB.from_dataframe.

    Returns
    -------
    Catalog
        The loaded catalog.

    Examples
    --------
    >>> from astropy.table import Table
    >>> import lsdb
    >>> data = {
    ...     "ra": [10.0, 20.0, 30.0],
    ...     "dec": [-10.0, -20.0, -30.0],
    ...     "magnitude": [15.0, 16.5, 14.2],
    ... }
    >>> table = Table(data)
    >>> catalog = lsdb.from_astropy(table, ra_column="ra", dec_column="dec")
    >>> catalog.head() # doctest: +NORMALIZE_WHITESPACE
    ra dec magnitude
    _healpix_29
    1212933045629049957 10.0 -10.0 15.0
    1176808107119886823 20.0 -20.0 16.5
    2510306432296314470 30.0 -30.0 14.2
    """
    # Go through pyarrow to convert the table to a dataframe.
    # Don't use table.to_pandas() as that would lose multidimensional column support
    arrow_table = _astropy_to_pyarrow_table(table, flatten_tensors=flatten_tensors)
    dataframe = from_pyarrow(arrow_table)
    # `flatten_tensors` is consumed by the conversion above; everything else
    # is forwarded unchanged to the generic dataframe loader.
    return from_dataframe(
        dataframe,
        ra_column=ra_column,
        dec_column=dec_column,
        lowest_order=lowest_order,
        highest_order=highest_order,
        drop_empty_siblings=drop_empty_siblings,
        partition_rows=partition_rows,
        partition_bytes=partition_bytes,
        margin_order=margin_order,
        margin_threshold=margin_threshold,
        should_generate_moc=should_generate_moc,
        moc_max_order=moc_max_order,
        use_pyarrow_types=use_pyarrow_types,
        schema=schema,
        **kwargs,
    )
# TODO: Code pulled from hats-import, potentially should move to hats
# In which case, remove this and use the hats version directly
# https://github.com/astronomy-commons/hats-import/issues/623
def _np_to_pyarrow_array(array: np.ndarray, *, flatten_tensors: bool) -> pa.Array:
    """Convert a numpy array to a pyarrow array.

    1-D input becomes a flat pyarrow array; 2-D input becomes a fixed-size
    list array; input with more than two dimensions becomes a fixed-shape
    tensor array, unless `flatten_tensors` is True, in which case the
    trailing dimensions are collapsed and a fixed-size list array is
    returned instead.

    Parameters
    ----------
    array : np.ndarray
        The numpy array to convert.
    flatten_tensors : bool
        If True, collapse all trailing dimensions of a >2-D array so the
        result is a 2-D fixed-size list array rather than a tensor array.

    Returns
    -------
    pa.Array
        The pyarrow representation of `array`.
    """
    # We usually have the "wrong" byte order from FITS; pyarrow needs native order.
    array = np.asanyarray(array, dtype=array.dtype.newbyteorder("="))
    values = pa.array(array.reshape(-1))
    # "Base" type: a 1-D array is already its final representation.
    if array.ndim == 1:
        return values
    # Flatten multidimensional nested values if asked; this makes the
    # array 2-D so the tensor branch below is skipped.
    if flatten_tensors and array.ndim > 2:
        array = array.reshape(array.shape[0], -1)
    # Cast the numpy scalar to a plain int for the pyarrow list size.
    list_size = int(np.prod(array.shape[1:]))
    pa_list_array = pa.FixedSizeListArray.from_arrays(values, list_size)
    # An extra dimension is represented as a list array
    if array.ndim == 2:
        return pa_list_array
    # array.ndim > 2
    # Multiple extra dimensions are represented as a tensor array
    tensor_type = pa.fixed_shape_tensor(values.type, shape=array.shape[1:])
    return pa.FixedShapeTensorArray.from_storage(tensor_type, pa_list_array)
def _astropy_to_pyarrow_table(astropy_table, *, flatten_tensors: bool) -> pa.Table:
    """Build a pyarrow.Table from an astropy.table.Table.

    Each column is converted to a numpy array and then to its pyarrow
    representation via `_np_to_pyarrow_array`.
    """
    converted = {
        name: _np_to_pyarrow_array(
            np.asarray(astropy_table[name]), flatten_tensors=flatten_tensors
        )
        for name in astropy_table.columns
    }
    return pa.table(converted)