490 lines
9.2 KiB
Markdown
490 lines
9.2 KiB
Markdown
# Table Operations (astropy.table)
|
|
|
|
The `astropy.table` module provides flexible tools for working with tabular data, with support for units, masked values, and various file formats.
|
|
|
|
## Creating Tables
|
|
|
|
### Basic Table Creation
|
|
|
|
```python
|
|
from astropy.table import Table, QTable
|
|
import astropy.units as u
|
|
import numpy as np
|
|
|
|
# From column arrays
|
|
a = [1, 4, 5]
|
|
b = [2.0, 5.0, 8.2]
|
|
c = ['x', 'y', 'z']
|
|
|
|
t = Table([a, b, c], names=('id', 'flux', 'name'))
|
|
|
|
# With units (use QTable)
|
|
flux = [1.2, 2.3, 3.4] * u.Jy
|
|
wavelength = [500, 600, 700] * u.nm
|
|
t = QTable([flux, wavelength], names=('flux', 'wavelength'))
|
|
```
|
|
|
|
### From Lists of Rows
|
|
|
|
```python
|
|
# List of tuples
|
|
rows = [(1, 10.5, 'A'), (2, 11.2, 'B'), (3, 12.3, 'C')]
|
|
t = Table(rows=rows, names=('id', 'value', 'name'))
|
|
|
|
# List of dictionaries
|
|
rows = [{'id': 1, 'value': 10.5}, {'id': 2, 'value': 11.2}]
|
|
t = Table(rows)
|
|
```
|
|
|
|
### From NumPy Arrays
|
|
|
|
```python
|
|
# Structured array
|
|
arr = np.array([(1, 2.0, 'x'), (4, 5.0, 'y')],
|
|
dtype=[('a', 'i4'), ('b', 'f8'), ('c', 'U10')])
|
|
t = Table(arr)
|
|
|
|
# 2D array with column names
|
|
data = np.random.random((100, 3))
|
|
t = Table(data, names=['col1', 'col2', 'col3'])
|
|
```
|
|
|
|
### From Pandas DataFrame
|
|
|
|
```python
|
|
import pandas as pd
|
|
|
|
df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
|
|
t = Table.from_pandas(df)
|
|
```
|
|
|
|
## Accessing Table Data
|
|
|
|
### Basic Access
|
|
|
|
```python
|
|
# Column access
|
|
ra_col = t['ra'] # Returns Column object
|
|
dec_col = t['dec']
|
|
|
|
# Row access
|
|
first_row = t[0] # Returns Row object
|
|
row_slice = t[10:20] # Returns new Table
|
|
|
|
# Cell access
|
|
value = t['ra'][5] # 6th value in 'ra' column
|
|
value = t[5]['ra'] # Same thing
|
|
|
|
# Multiple columns
|
|
subset = t['ra', 'dec', 'mag']
|
|
```
|
|
|
|
### Table Properties
|
|
|
|
```python
|
|
len(t) # Number of rows
|
|
t.colnames # List of column names
|
|
t.dtype # Column data types
|
|
t.info # Detailed information
|
|
t.meta # Metadata dictionary
|
|
```
|
|
|
|
### Iteration
|
|
|
|
```python
|
|
# Iterate over rows
|
|
for row in t:
|
|
print(row['ra'], row['dec'])
|
|
|
|
# Iterate over columns
|
|
for colname in t.colnames:
|
|
print(t[colname])
|
|
```
|
|
|
|
## Modifying Tables
|
|
|
|
### Adding Columns
|
|
|
|
```python
|
|
# Add new column
|
|
t['new_col'] = [1, 2, 3, 4, 5]
|
|
t['calc'] = t['a'] + t['b'] # Calculated column
|
|
|
|
# Add column with units
|
|
t['velocity'] = [10, 20, 30] * u.km / u.s
|
|
|
|
# Add empty column
|
|
from astropy.table import Column
|
|
t['empty'] = Column(length=len(t), dtype=float)
|
|
|
|
# Insert at specific position
|
|
t.add_column([7, 8, 9], name='inserted', index=2)
|
|
```
|
|
|
|
### Removing Columns
|
|
|
|
```python
|
|
# Remove single column
|
|
t.remove_column('old_col')
|
|
|
|
# Remove multiple columns
|
|
t.remove_columns(['col1', 'col2'])
|
|
|
|
# Delete syntax
|
|
del t['col_name']
|
|
|
|
# Keep only specific columns
|
|
t.keep_columns(['ra', 'dec', 'mag'])
|
|
```
|
|
|
|
### Renaming Columns
|
|
|
|
```python
|
|
t.rename_column('old_name', 'new_name')
|
|
|
|
# Rename multiple
|
|
t.rename_columns(['old1', 'old2'], ['new1', 'new2'])
|
|
```
|
|
|
|
### Adding Rows
|
|
|
|
```python
|
|
# Add single row
|
|
t.add_row([1, 2.5, 'new'])
|
|
|
|
# Add row as dict
|
|
t.add_row({'ra': 10.5, 'dec': 41.2, 'mag': 18.5})
|
|
|
|
# Note: Adding rows one at a time is slow!
|
|
# Better to collect rows and create table at once
|
|
```
|
|
|
|
### Modifying Data
|
|
|
|
```python
|
|
# Modify column values
|
|
t['flux'] = t['flux'] * gain
|
|
t['mag'][t['mag'] < 0] = np.nan
|
|
|
|
# Modify single cell
|
|
t['ra'][5] = 10.5
|
|
|
|
# Modify entire row
|
|
t[0] = [new_id, new_ra, new_dec]
|
|
```
|
|
|
|
## Sorting and Filtering
|
|
|
|
### Sorting
|
|
|
|
```python
|
|
# Sort by single column
|
|
t.sort('mag')
|
|
|
|
# Sort descending
|
|
t.sort('mag', reverse=True)
|
|
|
|
# Sort by multiple columns
|
|
t.sort(['priority', 'mag'])
|
|
|
|
# Get sorted indices without modifying table
|
|
indices = t.argsort('mag')
|
|
sorted_table = t[indices]
|
|
```
|
|
|
|
### Filtering
|
|
|
|
```python
|
|
# Boolean indexing
|
|
bright = t[t['mag'] < 18]
|
|
nearby = t[t['distance'] < 100*u.pc]
|
|
|
|
# Multiple conditions
|
|
selected = t[(t['mag'] < 18) & (t['dec'] > 0)]
|
|
|
|
# Using numpy functions
|
|
high_snr = t[np.abs(t['flux'] / t['error']) > 5]
|
|
```
|
|
|
|
## Reading and Writing Files
|
|
|
|
### Supported Formats
|
|
|
|
FITS, HDF5, ASCII (CSV, ECSV, IPAC, etc.), VOTable, Parquet, ASDF
|
|
|
|
### Reading Files
|
|
|
|
```python
|
|
# Automatic format detection
|
|
t = Table.read('catalog.fits')
|
|
t = Table.read('data.csv')
|
|
t = Table.read('table.vot')
|
|
|
|
# Specify format explicitly
|
|
t = Table.read('data.txt', format='ascii')
|
|
t = Table.read('catalog.hdf5', path='/dataset/table')
|
|
|
|
# Read specific HDU from FITS
|
|
t = Table.read('file.fits', hdu=2)
|
|
```
|
|
|
|
### Writing Files
|
|
|
|
```python
|
|
# Automatic format from extension
|
|
t.write('output.fits')
|
|
t.write('output.csv')
|
|
|
|
# Specify format
|
|
t.write('output.txt', format='ascii.csv')
|
|
t.write('output.hdf5', path='/data/table', serialize_meta=True)
|
|
|
|
# Overwrite existing file
|
|
t.write('output.fits', overwrite=True)
|
|
```
|
|
|
|
### ASCII Format Options
|
|
|
|
```python
|
|
# CSV with custom delimiter
|
|
t.write('output.csv', format='ascii.csv', delimiter='|')
|
|
|
|
# Fixed-width format
|
|
t.write('output.txt', format='ascii.fixed_width')
|
|
|
|
# IPAC format
|
|
t.write('output.tbl', format='ascii.ipac')
|
|
|
|
# LaTeX table
|
|
t.write('table.tex', format='ascii.latex')
|
|
```
|
|
|
|
## Table Operations
|
|
|
|
### Stacking Tables (Vertical)
|
|
|
|
```python
|
|
from astropy.table import vstack
|
|
|
|
# Concatenate tables vertically
|
|
t1 = Table([[1, 2], [3, 4]], names=('a', 'b'))
|
|
t2 = Table([[5, 6], [7, 8]], names=('a', 'b'))
|
|
t_combined = vstack([t1, t2])
|
|
```
|
|
|
|
### Joining Tables (Horizontal)
|
|
|
|
```python
|
|
from astropy.table import hstack
|
|
|
|
# Concatenate tables horizontally
|
|
t1 = Table([[1, 2]], names=['a'])
|
|
t2 = Table([[3, 4]], names=['b'])
|
|
t_combined = hstack([t1, t2])
|
|
```
|
|
|
|
### Database-Style Joins
|
|
|
|
```python
|
|
from astropy.table import join
|
|
|
|
# Inner join on common column
|
|
t1 = Table([[1, 2, 3], ['a', 'b', 'c']], names=('id', 'data1'))
|
|
t2 = Table([[1, 2, 4], ['x', 'y', 'z']], names=('id', 'data2'))
|
|
t_joined = join(t1, t2, keys='id')
|
|
|
|
# Left/right/outer joins
|
|
t_joined = join(t1, t2, join_type='left')
|
|
t_joined = join(t1, t2, join_type='outer')
|
|
```
|
|
|
|
### Grouping and Aggregating
|
|
|
|
```python
|
|
# Group by column
|
|
g = t.group_by('filter')
|
|
|
|
# Aggregate groups
|
|
means = g.groups.aggregate(np.mean)
|
|
|
|
# Iterate over groups
|
|
for group in g.groups:
|
|
print(f"Filter: {group['filter'][0]}")
|
|
print(f"Mean mag: {np.mean(group['mag'])}")
|
|
```
|
|
|
|
### Unique Rows
|
|
|
|
```python
|
|
# Get unique rows
|
|
t_unique = t.unique('id')
|
|
|
|
# Multiple columns
|
|
t_unique = t.unique(['ra', 'dec'])
|
|
```
|
|
|
|
## Units and Quantities
|
|
|
|
Use QTable for unit-aware operations:
|
|
|
|
```python
|
|
from astropy.table import QTable
|
|
|
|
# Create table with units
|
|
t = QTable()
|
|
t['flux'] = [1.2, 2.3, 3.4] * u.Jy
|
|
t['wavelength'] = [500, 600, 700] * u.nm
|
|
|
|
# Unit conversions
|
|
t['flux'].to(u.mJy)
|
|
t['wavelength'].to(u.angstrom)
|
|
|
|
# Calculations preserve units
|
|
t['freq'] = t['wavelength'].to(u.Hz, equivalencies=u.spectral())
|
|
```
|
|
|
|
## Masking Missing Data
|
|
|
|
```python
|
|
from astropy.table import MaskedColumn
|
|
import numpy as np
|
|
|
|
# Create masked column
|
|
flux = MaskedColumn([1.2, np.nan, 3.4], mask=[False, True, False])
|
|
t = Table([flux], names=['flux'])
|
|
|
|
# Operations automatically handle masks
|
|
mean_flux = np.ma.mean(t['flux'])
|
|
|
|
# Fill masked values
|
|
t['flux'].filled(0) # Replace masked with 0
|
|
```
|
|
|
|
## Indexing for Fast Lookup
|
|
|
|
Create indices for fast row retrieval:
|
|
|
|
```python
|
|
# Add index on column
|
|
t.add_index('id')
|
|
|
|
# Fast lookup by index
|
|
row = t.loc[12345] # Find row where id=12345
|
|
|
|
# Range queries
|
|
subset = t.loc[100:200]
|
|
```
|
|
|
|
## Table Metadata
|
|
|
|
```python
|
|
# Set table-level metadata
|
|
t.meta['TELESCOPE'] = 'HST'
|
|
t.meta['FILTER'] = 'F814W'
|
|
t.meta['EXPTIME'] = 300.0
|
|
|
|
# Set column-level metadata
|
|
t['ra'].meta['unit'] = 'deg'
|
|
t['ra'].meta['description'] = 'Right Ascension'
|
|
t['ra'].description = 'Right Ascension' # Shortcut
|
|
```
|
|
|
|
## Performance Tips
|
|
|
|
### Fast Table Construction
|
|
|
|
```python
|
|
# SLOW: Adding rows one at a time
|
|
t = Table(names=['a', 'b'])
|
|
for i in range(1000):
|
|
t.add_row([i, i**2])
|
|
|
|
# FAST: Build from lists
|
|
rows = [(i, i**2) for i in range(1000)]
|
|
t = Table(rows=rows, names=['a', 'b'])
|
|
```
|
|
|
|
### Memory-Mapped FITS Tables
|
|
|
|
```python
|
|
# Don't load entire table into memory
|
|
t = Table.read('huge_catalog.fits', memmap=True)
|
|
|
|
# Only loads data when accessed
|
|
subset = t[10000:10100] # Efficient
|
|
```
|
|
|
|
### Copy vs. View
|
|
|
|
```python
|
|
# Create view (shares data, fast)
|
|
t_view = t['ra', 'dec']
|
|
|
|
# Create copy (independent data)
|
|
t_copy = t['ra', 'dec'].copy()
|
|
```
|
|
|
|
## Displaying Tables
|
|
|
|
```python
|
|
# Print to console
|
|
print(t)
|
|
|
|
# Show in interactive browser
|
|
t.show_in_browser()
|
|
t.show_in_browser(jsviewer=True) # Interactive sorting/filtering
|
|
|
|
# Paginated viewing
|
|
t.more()
|
|
|
|
# Custom formatting
|
|
t['flux'].format = '%.3f'
|
|
t['ra'].format = '{:.6f}'
|
|
```
|
|
|
|
## Converting to Other Formats
|
|
|
|
```python
|
|
# To NumPy array
|
|
arr = np.array(t)
|
|
|
|
# To Pandas DataFrame
|
|
df = t.to_pandas()
|
|
|
|
# To dictionary
|
|
d = {name: t[name] for name in t.colnames}
|
|
```
|
|
|
|
## Common Use Cases
|
|
|
|
### Cross-Matching Catalogs
|
|
|
|
```python
|
|
from astropy.coordinates import SkyCoord, match_coordinates_sky
|
|
|
|
# Create coordinate objects from table columns
|
|
coords1 = SkyCoord(t1['ra'], t1['dec'], unit='deg')
|
|
coords2 = SkyCoord(t2['ra'], t2['dec'], unit='deg')
|
|
|
|
# Find matches
|
|
idx, sep, _ = coords1.match_to_catalog_sky(coords2)
|
|
|
|
# Filter by separation
|
|
max_sep = 1 * u.arcsec
|
|
matches = sep < max_sep
|
|
t1_matched = t1[matches]
|
|
t2_matched = t2[idx[matches]]
|
|
```
|
|
|
|
### Binning Data
|
|
|
|
```python
|
|
from astropy.table import Table
|
|
import numpy as np
|
|
|
|
# Bin by magnitude
|
|
mag_bins = np.arange(10, 20, 0.5)
|
|
binned = t.group_by(np.digitize(t['mag'], mag_bins))
|
|
counts = binned.groups.aggregate(len)
|
|
```
|