416 lines
19 KiB
Python

"""
This module provides an abstraction over geographical data sets which can be used to efficiently retrieve
properties for many nodes on a possibly irregular grid.
The implementation is currently single threaded but should be straightforward to parallelize.
Many commonly used datasets require a couple of gigabytes of memory, so make sure you do not open too many at
once.
"""
# Standard library
import math
# Typing
from typing import Any
from typing import cast
from typing import ContextManager
from typing import Optional
from typing import Tuple
from typing import Union
# Scientific
from numpy import allclose
from numpy import clip
from numpy import hstack
from numpy import linspace
from numpy import meshgrid
from numpy import ndarray
# Raster data set library
import rasterio
import rasterio.coords
import rasterio.io
import rasterio.windows
from rasterio.windows import Window
# Geographic helpers
from pyrate.plan.geometry.geospatial import MAXIMUM_EARTH_CIRCUMFERENCE
from pyrate.plan.geometry.helpers import meters2rad
class DataSetAccess(ContextManager["DataSetAccess"]):
"""Represents a global raster geo dataset that can be efficiently queried for a set of nodes.
See :meth:`~.get_bounding_windows_around` for why two bounding boxes/windows are supported by some
methods.
Notes:
The type of the data that is read depends on the dataset that is used.
Warning:
This class shall only be used as a context manager (using the `with`-syntax) in order to initialize
and clean up any resources that are required and possibly several gigabytes large. The behaviour of
this class is undefined after the context was left, as the internal data array is deleted to free up
memory. Also, the data is only guaranteed to be available once the context was entered.
Warning:
There are many subtle pitfalls with processing geographical datasets. They are only alleviated by
software abstractions like *GDAL*, *rasterio* and also *Pyrate* to some degree and are often badly
documented. For instance: All raster datasets define a grid (well, the *raster*). The creator(s) of
the dataset define whether the data at each entry refers to the center of the cell or the grid line of
the raster [1]. However, in a lot of software handling these datasets (like *GDAL* and *rasterio*),
it is not always clear what interpretation is used. *rasterio* seems to always assume grid-centered
values, and so does Pyrate. This could lead to small discrepancies between the data extracted from the
dataset and its real structure. On large-scale queries spanning many cells, this will not be a
problem.
While **many** libraries like *rasterio* exist for querying raster datasets at grid-like points or even
irregular points, none seems to allow for getting all raw data points of some datasets around some point
with some radius [2]. This was, however, required since we wanted to calculate statistics like "number of
points below sea level" for all data points around a given center point and radius. If we interpolated the
dataset to the center point, we would only get some local information and not cover the entire area within
the radius. The following libraries were investigated but did not provide the needed functionality:
- `GDALRasterBand tool from GDAL <https://gdal.org/api/gdalrasterband_cpp.html>`_
- `grdtrack tool from GMT <https://docs.generic-mapping-tools.org/latest/grdtrack.html>`_
or `its Python version <https://www.pygmt.org/latest/api/generated/pygmt.grdtrack.html>`_
- `geonum's topodata module <https://geonum.readthedocs.io/en/latest/api.html#module-geonum.topodata>`_
Args:
dataset: A :mod:`rasterio` raster dataset to read from or path to the file to open.
It must cover the entire planet with ``(0, 0)`` degrees being in the center.
raster_band_index: the index of band (the "layer") of the raster dataset (*GDAL*/*rasterio*
terminology)
Attributes:
dataset: the underlying :mod:`rasterio` dataset; read-only
raster_band_index: the index of band (the "layer") of the raster dataset (*GDAL*/*rasterio*
terminology); read-only
References:
- [1] This concept is also called grid- vs cell-registration. See also
`this Earth Science post <https://earthscience.stackexchange.com/q/4868/10692>`__.
- [2] Some `kind answers <https://lists.osgeo.org/pipermail/gdal-dev/2020-December/053125.html>`__
on the *gdal-dev* mailing list that did not help.
"""
#: The bounding box that any dataset wrapped by this class must match
_DATASET_BOUNDING_BOX = rasterio.coords.BoundingBox(left=-180.0, bottom=-90.0, right=180.0, top=90.0)
def __init__(self, dataset: Union[str, rasterio.DatasetReader], raster_band_index: int = 1) -> None:
self.dataset = rasterio.open(dataset) if isinstance(dataset, str) else dataset
self.raster_band_index = raster_band_index
self._data_array: ndarray
self._dataset_window = Window.from_slices(
rows=(0, self.dataset.height),
cols=(0, self.dataset.width),
)
assert allclose(DataSetAccess._DATASET_BOUNDING_BOX, self.dataset.bounds, atol=1e-12), (
"the dataset needs to cover the entire planet with (0, 0) degrees being in the center but was "
+ repr(self.dataset.bounds)
)
def get_bounding_windows_around(
self, center_latitude: float, center_longitude: float, radius: float
) -> Tuple[Window, Optional[Window]]:
"""Computes a bounding boxes/windows around the given center containing all points within the radius.
This method will return one bounding box per coordinate pair for *most* locations on earth.
However, near longitude +180°/-180° the dataset wraps around at the side, and as such, a bounding box
used for querying the dataset contains one or two such bounding boxes. Due to the internal design of
numpy, such a "wrapping" query cannot be expressed in a single slice [1] [2]. Thus, this method
might return one or two windows.
Correspondingly, :meth:`~.lat_lon_meshgrid_for` and :meth:`~.data_for` take one or two windows each.
Args:
center_latitude: The latitude of center of the box, in radians
center_longitude: The longitude of center of the box, in radians
radius: The radius around the center that the box should include. Strictly positive, in meters.
Returns:
One or two integer bounding windows as a tuple; might be slightly overestimated (by design)
References:
- [1] Some `answers <https://lists.osgeo.org/pipermail/gdal-dev/2020-December/053132.html>`_
on the *gdal-dev* mailing list that did not help
- [2] Numpy docs on
`Basic Slicing and Indexing
<https://numpy.org/doc/stable/reference/arrays.indexing.html#basic-slicing-and-indexing>`_
"""
# pylint: disable=too-many-locals
assert radius > 0.0, "radius must be strictly positive"
center_longitude = math.degrees(center_longitude)
center_latitude = math.degrees(center_latitude)
delta_lat = math.degrees(meters2rad(radius)) # uniform across the globe
assert delta_lat > 0.0, "is the input in radians?"
# Better slightly overestimate it by using the earth circumference at the equator
# That is about 1% larger than mean circumference
earth_circumference_at_lat = math.cos(math.radians(center_latitude)) * MAXIMUM_EARTH_CIRCUMFERENCE
delta_lon = radius / earth_circumference_at_lat * 360.0
assert delta_lon > 0.0, "is the input in radians?"
# Top & bottom are simple: just clip the latitude at the poles
# Left & right are a bit trickier since that possibly requires the creation of two slices
# These four coordinates determine the primary window
left = center_longitude - delta_lon
bottom = max(center_latitude - delta_lat, -90.0)
right = center_longitude + delta_lon
top = min(center_latitude + delta_lat, +90.0)
# `additional_window` determines the extra query if wrapping near longitude (+/-) 180° occurs
# `window` is geographically more west-ward than `additional_window`, if the latter exists
# Keep in mind though, that the common border might lie on +/- 180° longitude and thus on the
# border of the dataset/data array
window: Window
additional_window: Optional[Window]
# Handle the horizontal overflow of the window
# This also handles the case where 2*radius is larger than the width of the dataset
if left < -180.0: # Overflow on the left
overshoot = clip(-(left + 180), 0.0, 360.0)
left_wrapped = +180 - overshoot
# It might be the case that it also overflows on the right if the overall radius was so large
# that the window(s) would wrap around the world more than once. This can especially happen
# at high latitudes, where horizontally wrapping around the globe can happen at arbitrarily small
# radii/window sizes near the poles. We thus clip it to (in sum) only cover the world once.
right = clip(right, -180.0, left_wrapped)
# If the bounds overflow on the left, make the wrapped (i.e. the non-clipped) one the primary
# window, as it is geographically more west-ward
window = self.dataset.window(left_wrapped, bottom, +180.0, top)
additional_window = self.dataset.window(
-180.0, bottom, right, top
) # Possibly a window with zero width
elif right > +180.0: # Overflow on the right
overshoot = clip(right - 180, 0.0, 360.0)
right_wrapped = -180 + overshoot
# See the previous case "Overflow on the left" for an analogous explanation
left = clip(left, right_wrapped, +180.0)
# If the bounds overflow on the right, make the clipped (i.e. the non-wrapped) one the primary
# window, as it is geographically more west-ward
window = self.dataset.window(left, bottom, +180.0, top)
# `right_wrapped == -180` similar to above cannot occur here since then we must have landed in the
# `left < -180.0` branch instead
assert right_wrapped > -180, "The window would extend zero meters from east to west"
additional_window = self.dataset.window(-180.0, bottom, right_wrapped, top)
else: # No overflow at the bounds occurred, so we only need one `window`
window = self.dataset.window(left, bottom, right, top)
additional_window = None
# Jointly round the window(s) to integers and return the result
return self._round_windows_ceil(window, additional_window)
def _round_windows_ceil(
self, window_1: Window, window_2: Optional[Window]
) -> Tuple[Window, Optional[Window]]:
"""Rounds one or two windows to integer types and avoids an overlap to be created.
Always rounds to the larger windows if non-integer bounds are given. This guarantees that at least all
points initially given as the window(s) are also included in the resulting windows.
The actual rounding is done in :func:`rasterio.windows.window_index`.
Args:
window_1: The left window
window_2: An optional right window (geographically touches the eastern/right side of
``window_1``). Keep in mind though, that the common border might lie on +/- 180°
longitude and thus on the border of the dataset/data array.
Returns:
One or two windows, rounded to :class:`int` values.
Due to rounding, this method may return only a single window even if two were initially provided.
"""
(_, (_, w1_old_right)) = window_1.toranges()
# Round the first window
# The actual rounding is done in :func:`rasterio.windows.window_index`
window_1 = Window.from_slices(*cast(Tuple[slice, slice], rasterio.windows.window_index(window_1)))
# The rounding may move it beyond the bounds of the dataset, so clip it at the array borders
window_1 = window_1.intersection(self._dataset_window)
if window_2 is not None:
# Adjust `window_2` in the correct direction for a possibly created overlap
# Afterward, round it too
# Unpack the bounds that we will work with
((w1_top, w1_bottom), (_, w1_right)) = window_1.toranges()
(_, (left, right)) = window_2.toranges()
# Correct for the horizontal change that was induced by enlarging the `window_1`
# This will make `window_2` smaller if their common boundary was not already on a cell border
left += w1_right - w1_old_right
# Round away from the existing `window_1`, i.e. to the right/geographically west-ward
left = int(math.ceil(left))
right = int(math.ceil(right))
# There is a 1-cell overlap between the windows that was created by rounding, i.e.
# ``right == w1_left``. Therefore, we cut one index off.
right -= 1
# The case ``left == w1_right`` cannot occur since the second window is always guaranteed to be to
# the right of the first. We still check that though:
assert (
left - w1_right
) % self.dataset.width == 0, "this can never happen if the second window is truly to the right"
# Make sure that the extra window is non-empty and if not, just discard it
if right - left <= 0:
window_2 = None
else:
# We simply adopt the top and bottom bounds as they are the same in both windows
window_2 = Window.from_slices(rows=(w1_top, w1_bottom), cols=(left, right))
# May become obsolete if https://github.com/mapbox/rasterio/pull/2090 gets accepted
def to_int(win: Window) -> Window:
((float_top, float_bottom), (float_left, float_right)) = win.toranges()
return Window.from_slices(
rows=(int(float_top), int(float_bottom)), cols=(int(float_left), int(float_right))
)
return to_int(window_1), window_2 if window_2 is None else to_int(window_2)
def _lat_lon_meshgrid_single(self, window: Window, radians: bool) -> Tuple[ndarray, ndarray]:
"""Creates a meshgrid with all coordinates the data set has entries for in the given window.
Args:
window: as returned by :meth:`~get_bounding_windows_around`
radians: if ``True`` return in radians, else in degrees
Returns:
A latitude, longitude meshgrid matching the data returned by :meth:`~_data_single`
"""
# These values are in degrees
longitude_left, latitude_up = self.dataset.xy(window.row_off, window.col_off)
longitude_right, latitude_down = self.dataset.xy(
window.row_off + window.height, window.col_off + window.width
)
if radians:
longitude_left = math.radians(longitude_left)
latitude_up = math.radians(latitude_up)
longitude_right = math.radians(longitude_right)
latitude_down = math.radians(latitude_down)
coords_lat = linspace(latitude_up, latitude_down, window.height)
coords_lon = linspace(longitude_left, longitude_right, window.width)
coords_lat, coords_lon = meshgrid(coords_lat, coords_lon, indexing="ij")
assert coords_lat.shape == coords_lon.shape
return coords_lat, coords_lon
def lat_lon_meshgrid_for(
self,
window: Window,
additional_window: Optional[Window],
radians: bool,
) -> Tuple[ndarray, ndarray]:
"""Creates a meshgrid with all coordinates the data set has entries for in the given windows.
Args:
window: as returned by :meth:`~get_bounding_windows_around`
additional_window: as returned by :meth:`~get_bounding_windows_around`
radians: if ``True`` return in radians, else in degrees
Returns:
A single latitude, longitude meshgrid matching the data returned by :meth:`~data_for`
"""
coords_lat, coords_lon = self._lat_lon_meshgrid_single(window, radians)
# append additional window (only if required)
if additional_window is not None:
coords_lat_additional, coords_lon_additional = self._lat_lon_meshgrid_single(
additional_window, radians
)
coords_lat = hstack((coords_lat, coords_lat_additional))
coords_lon = hstack((coords_lon, coords_lon_additional))
return coords_lat, coords_lon
def _data_single(self, window: Window) -> ndarray:
"""Get all data points within the given window.
Notes:
The type of the data that is read depends on the dataset that is used.
See ``self.dataset.dtypes``.
Warnings:
Never modify the data returned by this method directly! It is only a view into the raw data.
Args:
window: A window as returned by :meth:`~get_bounding_windows_around`
Returns:
The 2D data array matching the coordinates returned by :meth:`~_lat_lon_meshgrid_single`
"""
assert hasattr(self, "_data_array"), "DataSetAccess must be used as a context manager and be open"
# one could read by this:
# self.dataset.read(self.raster_band_index, window=window)
# however, this does not map the file into memory and is thus about 10x slower than directly using a
# numpy array
# keep in mind that the slice will create a view into the raw data, and not a copy
# this is intentional to make the data access fast
data: ndarray = self._data_array[window.toslices()]
assert data.shape == (window.height, window.width)
return data
def data_for(self, window: Window, additional_window: Optional[Window]) -> ndarray:
"""Get all data points within the given windows as a single array.
Notes:
The type of the data that is read depends on the dataset that is used.
See ``self.dataset.dtypes``.
Warnings:
Never modify the data returned by this method directly! It is only a view into the raw data.
Args:
window: as returned by :meth:`~get_bounding_windows_around`
additional_window: as returned by :meth:`~get_bounding_windows_around`
Returns:
The single 2D data array matching the coordinates returned by :meth:`~lat_lon_meshgrid_for`
"""
result = self._data_single(window)
# append additional window (only if required)
if additional_window is not None:
additional_result = self._data_single(additional_window)
result = hstack((result, additional_result))
return result
def __enter__(self) -> "DataSetAccess":
self.dataset.__enter__()
self._data_array = self.dataset.read(self.raster_band_index)
self._data_array.flags.writeable = False # make this read-only to prevent accidents
return self
def __exit__(self, exc_type, exc_val, exc_tb) -> Any:
del self._data_array
return self.dataset.__exit__(exc_type, exc_val, exc_tb)