Source code for lsdb.loaders.dataframe.from_astropy

import numpy as np
import pyarrow as pa
from nested_pandas.nestedframe.io import from_pyarrow

from lsdb.loaders.dataframe.from_dataframe import from_dataframe


# pylint: disable=too-many-arguments
def from_astropy(
    table,
    *,
    ra_column: str | None = None,
    dec_column: str | None = None,
    lowest_order: int = 0,
    highest_order: int = 7,
    drop_empty_siblings: bool = True,
    partition_rows: int | None = None,
    partition_bytes: int | None = None,
    margin_order: int = -1,
    margin_threshold: float | None = 5.0,
    should_generate_moc: bool = True,
    moc_max_order: int = 10,
    use_pyarrow_types: bool = True,
    schema=None,
    flatten_tensors: bool = False,
    **kwargs,
):
    """Build a catalog from an in-memory Astropy Table.

    This loader is intended for small inputs only (fewer than 1 million rows
    and less than 1GB as an in-memory dataframe). For larger datasets, use the
    hats-import package instead: https://hats-import.readthedocs.io/

    Parameters
    ----------
    table : astropy.table.Table
        The Astropy Table (or QTable) to load.
    ra_column : str, optional
        Name of the right ascension column. When omitted, case-insensitive
        versions of 'ra' are detected.
    dec_column : str, optional
        Name of the declination column. When omitted, case-insensitive
        versions of 'dec' are detected.
    lowest_order : int, default 0
        The lowest partition order.
    highest_order : int, default 7
        The highest partition order.
    drop_empty_siblings : bool, default True
        When determining the final partitioning, if 3 of 4 pixels are empty,
        keep only the non-empty pixel.
    partition_rows : int or None, default None
        The desired partition size, in number of rows. Only one of
        `partition_rows` or `partition_bytes` should be specified.
        Note: partitioning is spatial (HEALPix-based). `partition_rows` is a
        best-effort target, and the resulting number of partitions is limited
        by `highest_order` and the sky footprint of your data.
    partition_bytes : int or None, default None
        The desired partition size, in bytes. Only one of `partition_rows` or
        `partition_bytes` should be specified.
        Note: as with `partition_rows`, this is a best-effort target for
        spatial (HEALPix-based) partitioning and is limited by
        `highest_order`.
    margin_order : int, default -1
        The order at which to generate the margin cache.
    margin_threshold : float or None, default 5
        The threshold (in arcseconds) for including sources in the margin
        cache. If None, and margin_order is specified, the margin cache will
        include all sources in the margin pixels.
    should_generate_moc : bool, default True
        If True, generates a MOC for the catalog.
    moc_max_order : int, default 10
        The maximum order to use when generating the MOC.
    use_pyarrow_types : bool, default True
        If True, uses PyArrow backed types in the resulting catalog.
    schema : pa.Schema or None, default None
        The arrow schema to create the catalog with. If None, the schema is
        automatically inferred from the DataFrame conversion of the table
        using `pa.Schema.from_pandas`.
    flatten_tensors : bool, default False
        If True, flattens multidimensional columns to 2D arrays in the
        resulting catalog. This only affects the table-to-arrow conversion
        and is not forwarded to `from_dataframe`.
    **kwargs
        Additional arguments to pass along to LSDB.from_dataframe.

    Returns
    -------
    Catalog
        The loaded catalog.

    Examples
    --------
    >>> from astropy.table import Table
    >>> import lsdb
    >>> data = {
    ...     "ra": [10.0, 20.0, 30.0],
    ...     "dec": [-10.0, -20.0, -30.0],
    ...     "magnitude": [15.0, 16.5, 14.2],
    ... }
    >>> table = Table(data)
    >>> catalog = lsdb.from_astropy(table, ra_column="ra", dec_column="dec")
    >>> catalog.head()  # doctest: +NORMALIZE_WHITESPACE
                           ra   dec  magnitude
    _healpix_29
    1212933045629049957  10.0 -10.0       15.0
    1176808107119886823  20.0 -20.0       16.5
    2510306432296314470  30.0 -30.0       14.2
    """
    # The table is routed through pyarrow on its way to a dataframe;
    # table.to_pandas() is avoided because it would lose multidimensional
    # column support.
    nested_frame = from_pyarrow(
        _astropy_to_pyarrow_table(table, flatten_tensors=flatten_tensors)
    )
    return from_dataframe(
        nested_frame,
        ra_column=ra_column,
        dec_column=dec_column,
        lowest_order=lowest_order,
        highest_order=highest_order,
        drop_empty_siblings=drop_empty_siblings,
        partition_rows=partition_rows,
        partition_bytes=partition_bytes,
        margin_order=margin_order,
        margin_threshold=margin_threshold,
        should_generate_moc=should_generate_moc,
        moc_max_order=moc_max_order,
        use_pyarrow_types=use_pyarrow_types,
        schema=schema,
        **kwargs,
    )


# TODO: Code pulled from hats-import, potentially should move to hats
# In which case, remove this and use the hats version directly
# https://github.com/astronomy-commons/hats-import/issues/623
def _np_to_pyarrow_array(array: np.ndarray, *, flatten_tensors: bool) -> pa.Array:
    """Convert a numpy array to a pyarrow array.

    The first axis of ``array`` is treated as the row axis; any further axes
    describe the per-row value.

    Parameters
    ----------
    array : np.ndarray
        The column data to convert.
    flatten_tensors : bool
        If True, an array with more than two dimensions is collapsed to two
        dimensions (rows x flattened values) before conversion.

    Returns
    -------
    pa.Array
        - for 1-dimensional input, a plain pyarrow array of the values;
        - for 2-dimensional input (including input flattened via
          ``flatten_tensors``), a fixed-size list array;
        - otherwise, a fixed-shape tensor array whose per-row shape is
          ``array.shape[1:]``.
    """
    # We usually have the "wrong" byte order from FITS
    array = np.asanyarray(array, dtype=array.dtype.newbyteorder("="))
    values = pa.array(array.reshape(-1))

    # "Base" type
    if array.ndim == 1:
        return values

    # Number of scalar values stored per row. Cast to a plain int so that
    # downstream calls never see a numpy scalar.
    list_size = int(np.prod(array.shape[1:]))

    # Flatten multidimensional nested values if asked.
    # The row width is passed explicitly instead of -1: numpy cannot infer
    # a -1 dimension when the array has zero rows (size 0) and would raise.
    if flatten_tensors and array.ndim > 2:
        array = array.reshape(array.shape[0], list_size)

    pa_list_array = pa.FixedSizeListArray.from_arrays(values, list_size)

    # An extra dimension is represented as a list array
    if array.ndim == 2:
        return pa_list_array

    # array.ndim > 2
    # Multiple extra dimensions are represented as a tensor array
    tensor_type = pa.fixed_shape_tensor(values.type, shape=array.shape[1:])
    return pa.FixedShapeTensorArray.from_storage(tensor_type, pa_list_array)


def _astropy_to_pyarrow_table(astropy_table, *, flatten_tensors: bool) -> pa.Table:
    """Convert astropy.table.Table to pyarrow.Table.

    Each entry yielded by iterating ``astropy_table.columns`` is looked up on
    the table, converted to a numpy array and then to a pyarrow array; the
    resulting table uses those same entries as its column keys.

    Parameters
    ----------
    astropy_table : astropy.table.Table
        The table to convert.
    flatten_tensors : bool
        Forwarded to `_np_to_pyarrow_array` for every column.

    Returns
    -------
    pa.Table
        A pyarrow table with one column per column of the input.
    """
    pa_arrays = {
        column: _np_to_pyarrow_array(
            np.asarray(astropy_table[column]), flatten_tensors=flatten_tensors
        )
        for column in astropy_table.columns
    }
    return pa.table(pa_arrays)