Random Spatial Sampling
&
Watershed Analysis
This code demonstrates how to implement a stratified random sampling approach for watershed data using Python, GeoPandas, and related libraries. It randomly generates sample points within polygon boundaries at two different watershed levels (HUC8 and HUC12) and then summarizes soil attributes (e.g., aws0150) for each watershed. By comparing the mean values obtained from each sampling regime, you can evaluate how different spatial scales impact the results of environmental analyses. The script is designed to be reusable, featuring a function-based structure for flexibility in handling various numbers and levels of watersheds.
import geopandas as gpd
import pandas as pd
import random
from shapely.geometry import Point
# Randomly generate points within the extent of the polygon using random seed 0
def generate_random_points_in_polygon(poly, num_points, seed=0):
"""
Generate num_points random points within a single polygon (poly).
Return a list of shapely.geometry.Point objects.
Parameters
----------
poly : shapely.geometry.Polygon
The polygon within which to generate random points.
num_points : int
Number of random points to place inside the polygon.
seed : int
Random seed for reproducibility.
Returns
-------
list of shapely.geometry.Point
Random points contained entirely within the polygon.
"""
random.seed(seed)
# Get the extent of the polygon
minx, miny, maxx, maxy = poly.bounds
points = []
while len(points) < num_points:
x = random.uniform(minx, maxx)
y = random.uniform(miny, maxy)
# Test if each point is within the polygon boundary
candidate_point = Point(x, y)
if candidate_point.within(poly):
points.append(candidate_point)
return points
def main():
"""
Steps:
1. Reads HUC8, HUC12, and SSURGO data from a GeoPackage.
2. Calculates polygon area to determine number of random sample points.
3. Generates points within HUC8 and HUC12 polygons.
4. Joins these points with SSURGO data to retrieve aws0150 values.
5. Computes and prints mean aws0150 by HUC8 watershed for both sampling approaches.
6. Prints conclusions on the differences in results between HUC8- and HUC12-based sampling.
"""
# Read input data
huc8 = gpd.read_file("lab4.gpkg", layer="wdbhuc8")
huc12 = gpd.read_file("lab4.gpkg", layer="wdbhuc12")
ssurgo = gpd.read_file("lab4.gpkg", layer="ssurgo_mapunits_lab3")
# Get the area of the polygon, and convert it to square kilometers
# Calculate the number of points using a point density of 0.05 points/sqkm. Round to the nearest integer
huc8['area_km2'] = huc8.geometry.area / 1e6
huc8['n_points'] = (huc8['area_km2'] * 0.05).round().astype(int)
huc12['area_km2'] = huc12.geometry.area / 1e6
huc12['n_points'] = (huc12['area_km2'] * 0.05).round().astype(int)
# If so, encode the point and indicate the HUC8 watershed that contains the point (HUC8)
huc8_point_records = []
for idx, row in huc8.iterrows():
poly = row.geometry
huc8_id = row["HUC8"]
n = row["n_points"]
if n > 0:
pts = generate_random_points_in_polygon(poly, n, seed=0)
for pt in pts:
huc8_point_records.append({
"geometry": pt,
"huc8_id": huc8_id
})
gdf_points_huc8 = gpd.GeoDataFrame(huc8_point_records, crs=huc8.crs)
# If so, encode the point and indicate the HUC8 watershed that contains the point (HUC12 polygons but store HUC8 ID by slicing the first 8 digits)
huc12_point_records = []
for idx, row in huc12.iterrows():
poly = row.geometry
huc12_code = row["HUC12"]
huc8_id = huc12_code[:8]
n = row["n_points"]
if n > 0:
pts = generate_random_points_in_polygon(poly, n, seed=0)
for pt in pts:
huc12_point_records.append({
"geometry": pt,
"huc8_id": huc8_id
})
gdf_points_huc12 = gpd.GeoDataFrame(huc12_point_records, crs=huc12.crs)
# Spatial join the points with SSURGO polygons to get aws0150
points_huc8_with_aws = gpd.sjoin(
gdf_points_huc8,
ssurgo[['aws0150', 'geometry']],
how='left',
predicate='within'
)
points_huc12_with_aws = gpd.sjoin(
gdf_points_huc12,
ssurgo[['aws0150', 'geometry']],
how='left',
predicate='within'
)
# Calculate the mean aws0150 for each HUC8 watershed from the HUC8- and HUC12-based
# point samples, i.e., you will have three mean values for each point sample.
means_huc8_sample = (
points_huc8_with_aws.groupby('huc8_id')['aws0150']
.mean()
.reset_index()
.rename(columns={'aws0150': 'mean_aws0150_huc8sample'})
)
means_huc12_sample = (
points_huc12_with_aws.groupby('huc8_id')['aws0150']
.mean()
.reset_index()
.rename(columns={'aws0150': 'mean_aws0150_huc12sample'})
)
# Report the mean values for each HUC8 watershed from the two samples and my
# conclusions regarding the different results of sampling within HUC8 versus
# HUC12 watersheds. Print out six total mean values (three for the HUC8-based
# point sample and three for the HUC12-based sample)
compare = pd.merge(means_huc8_sample, means_huc12_sample, on='huc8_id', how='outer')
print("\n==================== FINAL RESULTS ====================")
print("Mean available water storage 0-150cm by HUC8 Watershed\n(from two sampling approaches)\n")
print(compare.to_string(index=False))
print("\n==================== CONCLUSIONS =====================")
conclusion_text = (
"Overall, the largest difference occurs for HUC8 10190005,\n"
"where the HUC8-based average exceeds the HUC12-based value\n"
"by more than a full unit. Meanwhile, HUC8 10190006 shows a\n"
"slightly higher mean from HUC12-based sampling, and HUC8\n"
"10190007 differs only marginally between the two methods.\n"
"Such variations can arise because finer subdivisions at\n"
"the HUC12 level capture different localized soil conditions\n"
"(thus shifting the overall mean), whereas sampling directly\n"
"within larger HUC8 polygons produces a broader, more\n"
"generalized average.\n"
)
print(conclusion_text)
if __name__ == "__main__":
main()