Random Spatial Sampling 

Watershed Analysis

This code demonstrates how to implement a stratified random sampling approach for watershed data using Python, GeoPandas, and related libraries. It randomly generates sample points within polygon boundaries at two different watershed levels (HUC8 and HUC12) and then summarizes soil attributes (e.g., aws0150) for each watershed. By comparing the mean values obtained from each sampling regime, you can evaluate how different spatial scales impact the results of environmental analyses. The script is designed to be reusable, featuring a function-based structure for flexibility in handling various numbers and levels of watersheds.

import geopandas as gpd

import pandas as pd

import random

from shapely.geometry import Point


# Randomly generate points within the extent of the polygon using random seed 0

def generate_random_points_in_polygon(poly, num_points, seed=0):

    """

    Generate num_points random points within a single polygon (poly).

    Return a list of shapely.geometry.Point objects.

   

    Parameters

    ----------

    poly : shapely.geometry.Polygon

        The polygon within which to generate random points.

    num_points : int

        Number of random points to place inside the polygon.

    seed : int

        Random seed for reproducibility.

       

    Returns

    -------

    list of shapely.geometry.Point

        Random points contained entirely within the polygon.

    """

    random.seed(seed)

    # Get the extent of the polygon

    minx, miny, maxx, maxy = poly.bounds

    points = []

    while len(points) < num_points:

        x = random.uniform(minx, maxx)

        y = random.uniform(miny, maxy)

        # Test if each point is within the polygon boundary

        candidate_point = Point(x, y)

        if candidate_point.within(poly):

            points.append(candidate_point)

    return points


def main():

    """

    Steps:

    1. Reads HUC8, HUC12, and SSURGO data from a GeoPackage.

    2. Calculates polygon area to determine number of random sample points.

    3. Generates points within HUC8 and HUC12 polygons.

    4. Joins these points with SSURGO data to retrieve aws0150 values.

    5. Computes and prints mean aws0150 by HUC8 watershed for both sampling approaches.

    6. Prints conclusions on the differences in results between HUC8- and HUC12-based sampling.

    """

    # Read input data

    huc8 = gpd.read_file("lab4.gpkg", layer="wdbhuc8")

    huc12 = gpd.read_file("lab4.gpkg", layer="wdbhuc12")

    ssurgo = gpd.read_file("lab4.gpkg", layer="ssurgo_mapunits_lab3")


    # Get the area of the polygon, and convert it to square kilometers

    # Calculate the number of points using a point density of 0.05 points/sqkm. Round to the nearest integer

    huc8['area_km2'] = huc8.geometry.area / 1e6

    huc8['n_points'] = (huc8['area_km2'] * 0.05).round().astype(int)


    huc12['area_km2'] = huc12.geometry.area / 1e6

    huc12['n_points'] = (huc12['area_km2'] * 0.05).round().astype(int)


    # If so, encode the point and indicate the HUC8 watershed that contains the point (HUC8)

    huc8_point_records = []

    for idx, row in huc8.iterrows():

        poly = row.geometry

        huc8_id = row["HUC8"]

        n = row["n_points"]

       

        if n > 0:

            pts = generate_random_points_in_polygon(poly, n, seed=0)

            for pt in pts:

                huc8_point_records.append({

                    "geometry": pt,

                    "huc8_id": huc8_id

                })

   

    gdf_points_huc8 = gpd.GeoDataFrame(huc8_point_records, crs=huc8.crs)


    # If so, encode the point and indicate the HUC8 watershed that contains the point (HUC12 polygons but store HUC8 ID by slicing the first 8 digits)

    huc12_point_records = []

    for idx, row in huc12.iterrows():

        poly = row.geometry

        huc12_code = row["HUC12"]

        huc8_id = huc12_code[:8]

        n = row["n_points"]

       

        if n > 0:

            pts = generate_random_points_in_polygon(poly, n, seed=0)

            for pt in pts:

                huc12_point_records.append({

                    "geometry": pt,

                    "huc8_id": huc8_id

                })

   

    gdf_points_huc12 = gpd.GeoDataFrame(huc12_point_records, crs=huc12.crs)


    # Spatial join the points with SSURGO polygons to get aws0150

    points_huc8_with_aws = gpd.sjoin(

        gdf_points_huc8,

        ssurgo[['aws0150', 'geometry']],

        how='left',

        predicate='within'

    )

    points_huc12_with_aws = gpd.sjoin(

        gdf_points_huc12,

        ssurgo[['aws0150', 'geometry']],

        how='left',

        predicate='within'

    )


    # Calculate the mean aws0150 for each HUC8 watershed from the HUC8- and HUC12-based

    # point samples, i.e., you will have three mean values for each point sample.

    means_huc8_sample = (

        points_huc8_with_aws.groupby('huc8_id')['aws0150']

        .mean()

        .reset_index()

        .rename(columns={'aws0150': 'mean_aws0150_huc8sample'})

    )


    means_huc12_sample = (

        points_huc12_with_aws.groupby('huc8_id')['aws0150']

        .mean()

        .reset_index()

        .rename(columns={'aws0150': 'mean_aws0150_huc12sample'})

    )


    # Report the mean values for each HUC8 watershed from the two samples and my

    # conclusions regarding the different results of sampling within HUC8 versus

    # HUC12 watersheds. Print out six total mean values (three for the HUC8-based

    # point sample and three for the HUC12-based sample)

    compare = pd.merge(means_huc8_sample, means_huc12_sample, on='huc8_id', how='outer')


    print("\n==================== FINAL RESULTS ====================")

    print("Mean available water storage 0-150cm by HUC8 Watershed\n(from two sampling approaches)\n")

    print(compare.to_string(index=False))


    print("\n==================== CONCLUSIONS =====================")

    conclusion_text = (

        "Overall, the largest difference occurs for HUC8 10190005,\n"

        "where the HUC8-based average exceeds the HUC12-based value\n"

        "by more than a full unit. Meanwhile, HUC8 10190006 shows a\n"

        "slightly higher mean from HUC12-based sampling, and HUC8\n"

        "10190007 differs only marginally between the two methods.\n"

        "Such variations can arise because finer subdivisions at\n"

        "the HUC12 level capture different localized soil conditions\n"

        "(thus shifting the overall mean), whereas sampling directly\n"

        "within larger HUC8 polygons produces a broader, more\n"

        "generalized average.\n"

    )

    print(conclusion_text)



if __name__ == "__main__":

    main()