416 lines
19 KiB
Python
416 lines
19 KiB
Python
"""
|
|
This module provides an abstraction over geographical data sets which can be used to efficiently retrieve
|
|
properties for many nodes on a possibly irregular grid.
|
|
|
|
The implementation is currently single threaded but should be straightforward to parallelize.
|
|
Many commonly used datasets require a couple of gigabytes of memory, so make sure you do not open too many at
|
|
once.
|
|
"""
|
|
|
|
# Standard library
|
|
import math
|
|
|
|
# Typing
|
|
from typing import Any
|
|
from typing import cast
|
|
from typing import ContextManager
|
|
from typing import Optional
|
|
from typing import Tuple
|
|
from typing import Union
|
|
|
|
# Scientific
|
|
from numpy import allclose
|
|
from numpy import clip
|
|
from numpy import hstack
|
|
from numpy import linspace
|
|
from numpy import meshgrid
|
|
from numpy import ndarray
|
|
|
|
# Raster data set library
|
|
import rasterio
|
|
import rasterio.coords
|
|
import rasterio.io
|
|
import rasterio.windows
|
|
from rasterio.windows import Window
|
|
|
|
# Geographic helpers
|
|
from pyrate.plan.geometry.geospatial import MAXIMUM_EARTH_CIRCUMFERENCE
|
|
from pyrate.plan.geometry.helpers import meters2rad
|
|
|
|
|
|
class DataSetAccess(ContextManager["DataSetAccess"]):
|
|
"""Represents a global raster geo dataset that can be efficiently queried for a set of nodes.
|
|
|
|
See :meth:`~.get_bounding_windows_around` for why two bounding boxes/windows are supported by some
|
|
methods.
|
|
|
|
Notes:
|
|
The type of the data that is read depends on the dataset that is used.
|
|
|
|
Warning:
|
|
This class shall only be used as a context manager (using the `with`-syntax) in order to initialize
|
|
and clean up any resources that are required and possibly several gigabytes large. The behaviour of
|
|
this class is undefined after the context was left, as the internal data array is deleted to free up
|
|
memory. Also, the data is only guaranteed to be available once the context was entered.
|
|
|
|
Warning:
|
|
There are many subtle pitfalls with processing geographical datasets. They are only alleviated by
|
|
software abstractions like *GDAL*, *rasterio* and also *Pyrate* to some degree and are often badly
|
|
documented. For instance: All raster datasets define a grid (well, the *raster*). The creator(s) of
|
|
the dataset define whether the data at each entry refers to the center of the cell or the grid line of
|
|
the raster [1]. However, in a lot of software handling these datasets (like *GDAL* and *rasterio*),
|
|
it is not always clear what interpretation is used. *rasterio* seems to always assume grid-centered
|
|
values, and so does Pyrate. This could lead to small discrepancies between the data extracted from the
|
|
dataset and its real structure. On large-scale queries spanning many cells, this will not be a
|
|
problem.
|
|
|
|
While **many** libraries like *rasterio* exist for querying raster datasets at grid-like points or even
|
|
irregular points, none seems to allow for getting all raw data points of some datasets around some point
|
|
with some radius [2]. This was, however, required since we wanted to calculate statistics like "number of
|
|
points below sea level" for all data points around a given center point and radius. If we interpolated the
|
|
dataset to the center point, we would only get some local information and not cover the entire area within
|
|
the radius. The following libraries were investigated but did not provide the needed functionality:
|
|
|
|
- `GDALRasterBand tool from GDAL <https://gdal.org/api/gdalrasterband_cpp.html>`_
|
|
|
|
- `grdtrack tool from GMT <https://docs.generic-mapping-tools.org/latest/grdtrack.html>`_
|
|
or `its Python version <https://www.pygmt.org/latest/api/generated/pygmt.grdtrack.html>`_
|
|
|
|
- `geonum's topodata module <https://geonum.readthedocs.io/en/latest/api.html#module-geonum.topodata>`_
|
|
|
|
Args:
|
|
dataset: A :mod:`rasterio` raster dataset to read from or path to the file to open.
|
|
It must cover the entire planet with ``(0, 0)`` degrees being in the center.
|
|
raster_band_index: the index of band (the "layer") of the raster dataset (*GDAL*/*rasterio*
|
|
terminology)
|
|
|
|
Attributes:
|
|
dataset: the underlying :mod:`rasterio` dataset; read-only
|
|
raster_band_index: the index of band (the "layer") of the raster dataset (*GDAL*/*rasterio*
|
|
terminology); read-only
|
|
|
|
References:
|
|
- [1] This concept is also called grid- vs cell-registration. See also
|
|
`this Earth Science post <https://earthscience.stackexchange.com/q/4868/10692>`__.
|
|
- [2] Some `kind answers <https://lists.osgeo.org/pipermail/gdal-dev/2020-December/053125.html>`__
|
|
on the *gdal-dev* mailing list that did not help.
|
|
"""
|
|
|
|
#: The bounding box that any dataset wrapped by this class must match
|
|
_DATASET_BOUNDING_BOX = rasterio.coords.BoundingBox(left=-180.0, bottom=-90.0, right=180.0, top=90.0)
|
|
|
|
def __init__(self, dataset: Union[str, rasterio.DatasetReader], raster_band_index: int = 1) -> None:
|
|
self.dataset = rasterio.open(dataset) if isinstance(dataset, str) else dataset
|
|
|
|
self.raster_band_index = raster_band_index
|
|
|
|
self._data_array: ndarray
|
|
self._dataset_window = Window.from_slices(
|
|
rows=(0, self.dataset.height),
|
|
cols=(0, self.dataset.width),
|
|
)
|
|
|
|
assert allclose(DataSetAccess._DATASET_BOUNDING_BOX, self.dataset.bounds, atol=1e-12), (
|
|
"the dataset needs to cover the entire planet with (0, 0) degrees being in the center but was "
|
|
+ repr(self.dataset.bounds)
|
|
)
|
|
|
|
def get_bounding_windows_around(
|
|
self, center_latitude: float, center_longitude: float, radius: float
|
|
) -> Tuple[Window, Optional[Window]]:
|
|
"""Computes a bounding boxes/windows around the given center containing all points within the radius.
|
|
|
|
This method will return one bounding box per coordinate pair for *most* locations on earth.
|
|
However, near longitude +180°/-180° the dataset wraps around at the side, and as such, a bounding box
|
|
used for querying the dataset contains one or two such bounding boxes. Due to the internal design of
|
|
numpy, such a "wrapping" query cannot be expressed in a single slice [1] [2]. Thus, this method
|
|
might return one or two windows.
|
|
Correspondingly, :meth:`~.lat_lon_meshgrid_for` and :meth:`~.data_for` take one or two windows each.
|
|
|
|
Args:
|
|
center_latitude: The latitude of center of the box, in radians
|
|
center_longitude: The longitude of center of the box, in radians
|
|
radius: The radius around the center that the box should include. Strictly positive, in meters.
|
|
|
|
Returns:
|
|
One or two integer bounding windows as a tuple; might be slightly overestimated (by design)
|
|
|
|
References:
|
|
- [1] Some `answers <https://lists.osgeo.org/pipermail/gdal-dev/2020-December/053132.html>`_
|
|
on the *gdal-dev* mailing list that did not help
|
|
|
|
- [2] Numpy docs on
|
|
`Basic Slicing and Indexing
|
|
<https://numpy.org/doc/stable/reference/arrays.indexing.html#basic-slicing-and-indexing>`_
|
|
"""
|
|
|
|
# pylint: disable=too-many-locals
|
|
|
|
assert radius > 0.0, "radius must be strictly positive"
|
|
|
|
center_longitude = math.degrees(center_longitude)
|
|
center_latitude = math.degrees(center_latitude)
|
|
|
|
delta_lat = math.degrees(meters2rad(radius)) # uniform across the globe
|
|
assert delta_lat > 0.0, "is the input in radians?"
|
|
|
|
# Better slightly overestimate it by using the earth circumference at the equator
|
|
# That is about 1% larger than mean circumference
|
|
earth_circumference_at_lat = math.cos(math.radians(center_latitude)) * MAXIMUM_EARTH_CIRCUMFERENCE
|
|
delta_lon = radius / earth_circumference_at_lat * 360.0
|
|
assert delta_lon > 0.0, "is the input in radians?"
|
|
|
|
# Top & bottom are simple: just clip the latitude at the poles
|
|
# Left & right are a bit trickier since that possibly requires the creation of two slices
|
|
|
|
# These four coordinates determine the primary window
|
|
left = center_longitude - delta_lon
|
|
bottom = max(center_latitude - delta_lat, -90.0)
|
|
right = center_longitude + delta_lon
|
|
top = min(center_latitude + delta_lat, +90.0)
|
|
|
|
# `additional_window` determines the extra query if wrapping near longitude (+/-) 180° occurs
|
|
# `window` is geographically more west-ward than `additional_window`, if the latter exists
|
|
# Keep in mind though, that the common border might lie on +/- 180° longitude and thus on the
|
|
# border of the dataset/data array
|
|
window: Window
|
|
additional_window: Optional[Window]
|
|
|
|
# Handle the horizontal overflow of the window
|
|
# This also handles the case where 2*radius is larger than the width of the dataset
|
|
if left < -180.0: # Overflow on the left
|
|
overshoot = clip(-(left + 180), 0.0, 360.0)
|
|
left_wrapped = +180 - overshoot
|
|
# It might be the case that it also overflows on the right if the overall radius was so large
|
|
# that the window(s) would wrap around the world more than once. This can especially happen
|
|
# at high latitudes, where horizontally wrapping around the globe can happen at arbitrarily small
|
|
# radii/window sizes near the poles. We thus clip it to (in sum) only cover the world once.
|
|
right = clip(right, -180.0, left_wrapped)
|
|
|
|
# If the bounds overflow on the left, make the wrapped (i.e. the non-clipped) one the primary
|
|
# window, as it is geographically more west-ward
|
|
window = self.dataset.window(left_wrapped, bottom, +180.0, top)
|
|
additional_window = self.dataset.window(
|
|
-180.0, bottom, right, top
|
|
) # Possibly a window with zero width
|
|
|
|
elif right > +180.0: # Overflow on the right
|
|
overshoot = clip(right - 180, 0.0, 360.0)
|
|
right_wrapped = -180 + overshoot
|
|
# See the previous case "Overflow on the left" for an analogous explanation
|
|
left = clip(left, right_wrapped, +180.0)
|
|
|
|
# If the bounds overflow on the right, make the clipped (i.e. the non-wrapped) one the primary
|
|
# window, as it is geographically more west-ward
|
|
window = self.dataset.window(left, bottom, +180.0, top)
|
|
|
|
# `right_wrapped == -180` similar to above cannot occur here since then we must have landed in the
|
|
# `left < -180.0` branch instead
|
|
assert right_wrapped > -180, "The window would extend zero meters from east to west"
|
|
|
|
additional_window = self.dataset.window(-180.0, bottom, right_wrapped, top)
|
|
|
|
else: # No overflow at the bounds occurred, so we only need one `window`
|
|
window = self.dataset.window(left, bottom, right, top)
|
|
additional_window = None
|
|
|
|
# Jointly round the window(s) to integers and return the result
|
|
return self._round_windows_ceil(window, additional_window)
|
|
|
|
def _round_windows_ceil(
|
|
self, window_1: Window, window_2: Optional[Window]
|
|
) -> Tuple[Window, Optional[Window]]:
|
|
"""Rounds one or two windows to integer types and avoids an overlap to be created.
|
|
|
|
Always rounds to the larger windows if non-integer bounds are given. This guarantees that at least all
|
|
points initially given as the window(s) are also included in the resulting windows.
|
|
|
|
The actual rounding is done in :func:`rasterio.windows.window_index`.
|
|
|
|
Args:
|
|
window_1: The left window
|
|
window_2: An optional right window (geographically touches the eastern/right side of
|
|
``window_1``). Keep in mind though, that the common border might lie on +/- 180°
|
|
longitude and thus on the border of the dataset/data array.
|
|
|
|
Returns:
|
|
One or two windows, rounded to :class:`int` values.
|
|
Due to rounding, this method may return only a single window even if two were initially provided.
|
|
"""
|
|
(_, (_, w1_old_right)) = window_1.toranges()
|
|
|
|
# Round the first window
|
|
# The actual rounding is done in :func:`rasterio.windows.window_index`
|
|
window_1 = Window.from_slices(*cast(Tuple[slice, slice], rasterio.windows.window_index(window_1)))
|
|
# The rounding may move it beyond the bounds of the dataset, so clip it at the array borders
|
|
window_1 = window_1.intersection(self._dataset_window)
|
|
|
|
if window_2 is not None:
|
|
# Adjust `window_2` in the correct direction for a possibly created overlap
|
|
# Afterward, round it too
|
|
|
|
# Unpack the bounds that we will work with
|
|
((w1_top, w1_bottom), (_, w1_right)) = window_1.toranges()
|
|
(_, (left, right)) = window_2.toranges()
|
|
|
|
# Correct for the horizontal change that was induced by enlarging the `window_1`
|
|
# This will make `window_2` smaller if their common boundary was not already on a cell border
|
|
left += w1_right - w1_old_right
|
|
|
|
# Round away from the existing `window_1`, i.e. to the right/geographically west-ward
|
|
left = int(math.ceil(left))
|
|
right = int(math.ceil(right))
|
|
|
|
# There is a 1-cell overlap between the windows that was created by rounding, i.e.
|
|
# ``right == w1_left``. Therefore, we cut one index off.
|
|
right -= 1
|
|
|
|
# The case ``left == w1_right`` cannot occur since the second window is always guaranteed to be to
|
|
# the right of the first. We still check that though:
|
|
assert (
|
|
left - w1_right
|
|
) % self.dataset.width == 0, "this can never happen if the second window is truly to the right"
|
|
|
|
# Make sure that the extra window is non-empty and if not, just discard it
|
|
if right - left <= 0:
|
|
window_2 = None
|
|
|
|
else:
|
|
# We simply adopt the top and bottom bounds as they are the same in both windows
|
|
window_2 = Window.from_slices(rows=(w1_top, w1_bottom), cols=(left, right))
|
|
|
|
# May become obsolete if https://github.com/mapbox/rasterio/pull/2090 gets accepted
|
|
def to_int(win: Window) -> Window:
|
|
((float_top, float_bottom), (float_left, float_right)) = win.toranges()
|
|
return Window.from_slices(
|
|
rows=(int(float_top), int(float_bottom)), cols=(int(float_left), int(float_right))
|
|
)
|
|
|
|
return to_int(window_1), window_2 if window_2 is None else to_int(window_2)
|
|
|
|
def _lat_lon_meshgrid_single(self, window: Window, radians: bool) -> Tuple[ndarray, ndarray]:
|
|
"""Creates a meshgrid with all coordinates the data set has entries for in the given window.
|
|
|
|
Args:
|
|
window: as returned by :meth:`~get_bounding_windows_around`
|
|
radians: if ``True`` return in radians, else in degrees
|
|
|
|
Returns:
|
|
A latitude, longitude meshgrid matching the data returned by :meth:`~_data_single`
|
|
"""
|
|
|
|
# These values are in degrees
|
|
longitude_left, latitude_up = self.dataset.xy(window.row_off, window.col_off)
|
|
longitude_right, latitude_down = self.dataset.xy(
|
|
window.row_off + window.height, window.col_off + window.width
|
|
)
|
|
|
|
if radians:
|
|
longitude_left = math.radians(longitude_left)
|
|
latitude_up = math.radians(latitude_up)
|
|
longitude_right = math.radians(longitude_right)
|
|
latitude_down = math.radians(latitude_down)
|
|
|
|
coords_lat = linspace(latitude_up, latitude_down, window.height)
|
|
coords_lon = linspace(longitude_left, longitude_right, window.width)
|
|
|
|
coords_lat, coords_lon = meshgrid(coords_lat, coords_lon, indexing="ij")
|
|
assert coords_lat.shape == coords_lon.shape
|
|
|
|
return coords_lat, coords_lon
|
|
|
|
def lat_lon_meshgrid_for(
|
|
self,
|
|
window: Window,
|
|
additional_window: Optional[Window],
|
|
radians: bool,
|
|
) -> Tuple[ndarray, ndarray]:
|
|
"""Creates a meshgrid with all coordinates the data set has entries for in the given windows.
|
|
|
|
Args:
|
|
window: as returned by :meth:`~get_bounding_windows_around`
|
|
additional_window: as returned by :meth:`~get_bounding_windows_around`
|
|
radians: if ``True`` return in radians, else in degrees
|
|
|
|
Returns:
|
|
A single latitude, longitude meshgrid matching the data returned by :meth:`~data_for`
|
|
"""
|
|
|
|
coords_lat, coords_lon = self._lat_lon_meshgrid_single(window, radians)
|
|
|
|
# append additional window (only if required)
|
|
if additional_window is not None:
|
|
coords_lat_additional, coords_lon_additional = self._lat_lon_meshgrid_single(
|
|
additional_window, radians
|
|
)
|
|
coords_lat = hstack((coords_lat, coords_lat_additional))
|
|
coords_lon = hstack((coords_lon, coords_lon_additional))
|
|
|
|
return coords_lat, coords_lon
|
|
|
|
def _data_single(self, window: Window) -> ndarray:
|
|
"""Get all data points within the given window.
|
|
|
|
Notes:
|
|
The type of the data that is read depends on the dataset that is used.
|
|
See ``self.dataset.dtypes``.
|
|
|
|
Warnings:
|
|
Never modify the data returned by this method directly! It is only a view into the raw data.
|
|
|
|
Args:
|
|
window: A window as returned by :meth:`~get_bounding_windows_around`
|
|
|
|
Returns:
|
|
The 2D data array matching the coordinates returned by :meth:`~_lat_lon_meshgrid_single`
|
|
"""
|
|
assert hasattr(self, "_data_array"), "DataSetAccess must be used as a context manager and be open"
|
|
|
|
# one could read by this:
|
|
# self.dataset.read(self.raster_band_index, window=window)
|
|
# however, this does not map the file into memory and is thus about 10x slower than directly using a
|
|
# numpy array
|
|
|
|
# keep in mind that the slice will create a view into the raw data, and not a copy
|
|
# this is intentional to make the data access fast
|
|
data: ndarray = self._data_array[window.toslices()]
|
|
assert data.shape == (window.height, window.width)
|
|
return data
|
|
|
|
def data_for(self, window: Window, additional_window: Optional[Window]) -> ndarray:
|
|
"""Get all data points within the given windows as a single array.
|
|
|
|
Notes:
|
|
The type of the data that is read depends on the dataset that is used.
|
|
See ``self.dataset.dtypes``.
|
|
|
|
Warnings:
|
|
Never modify the data returned by this method directly! It is only a view into the raw data.
|
|
|
|
Args:
|
|
window: as returned by :meth:`~get_bounding_windows_around`
|
|
additional_window: as returned by :meth:`~get_bounding_windows_around`
|
|
|
|
Returns:
|
|
The single 2D data array matching the coordinates returned by :meth:`~lat_lon_meshgrid_for`
|
|
"""
|
|
|
|
result = self._data_single(window)
|
|
|
|
# append additional window (only if required)
|
|
if additional_window is not None:
|
|
additional_result = self._data_single(additional_window)
|
|
result = hstack((result, additional_result))
|
|
|
|
return result
|
|
|
|
def __enter__(self) -> "DataSetAccess":
|
|
self.dataset.__enter__()
|
|
self._data_array = self.dataset.read(self.raster_band_index)
|
|
self._data_array.flags.writeable = False # make this read-only to prevent accidents
|
|
return self
|
|
|
|
def __exit__(self, exc_type, exc_val, exc_tb) -> Any:
|
|
del self._data_array
|
|
return self.dataset.__exit__(exc_type, exc_val, exc_tb)
|