Skip to content

Commit 1fee49e

Browse files
committed
Write custom schemas to fiboa metadata for use in improve/merge/etc. #113 and minor fixes
1 parent 729bb10 commit 1fee49e

File tree

7 files changed

+51
-6
lines changed

7 files changed

+51
-6
lines changed

CHANGELOG.md

+6
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
1717
- rename columns
1818
- Converter for Switzerland
1919

20+
### Changed
21+
22+
- `fiboa convert` writes custom schemas to collection metadata
23+
- `fiboa validate` uses custom schemas for validation
24+
- `fiboa merge` keeps custom schemas when needed
25+
2026
## [v0.8.0] - 2024-11-12
2127

2228
### Added

fiboa_cli/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -601,7 +601,7 @@ def merge(datasets, out, crs, include, exclude, extension, compression, geoparqu
601601
type=click.STRING,
602602
help='Coordinate Reference System (CRS) to use for the GeoParquet file.',
603603
show_default=True,
604-
default=DEFAULT_CRS,
604+
default=None,
605605
)
606606
@click.option(
607607
'--compression', '-pc',

fiboa_cli/improve.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
11
import os
22

3-
from pyproj import CRS
4-
53
from .const import CORE_COLUMNS
64
from .parquet import create_parquet
7-
from .util import load_parquet_data, load_parquet_schema, log, parse_metadata
5+
from .util import load_parquet_data, load_parquet_schema, log, parse_metadata, pick_schemas
86

97

108
def improve(input, out = None, rename_columns = {}, add_sizes = False, fix_geometries = False, crs = None, compression = None, geoparquet1 = False):
@@ -64,6 +62,12 @@ def improve(input, out = None, rename_columns = {}, add_sizes = False, fix_geome
6462
gdf["area"] = gdf_m["area"].fillna(gdf_m.geometry.area * 0.0001)
6563
gdf["perimeter"] = gdf_m["perimeter"].fillna(gdf_m.geometry.length)
6664

65+
custom_schemas = collection.get("fiboa_custom_schemas", {})
66+
custom_schemas = pick_schemas(custom_schemas, columns, rename_columns)
67+
if len(custom_schemas) > 0:
68+
collection["fiboa_custom_schemas"] = custom_schemas
69+
70+
6771
# Write the merged dataset to the output file
6872
create_parquet(gdf, columns, collection, out, {}, compression=compression, geoparquet1=geoparquet1)
6973
log(f"Wrote data to {out}", "success")

fiboa_cli/merge.py

+12-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from .const import CORE_COLUMNS
66
from .parquet import create_parquet
7-
from .util import load_parquet_data, load_parquet_schema, log, parse_metadata
7+
from .util import load_parquet_data, load_parquet_schema, log, merge_schemas, parse_metadata, pick_schemas
88
from .version import fiboa_version
99

1010
DEFAULT_CRS = "EPSG:4326"
@@ -20,22 +20,27 @@ def merge(datasets, out, crs = DEFAULT_CRS, includes = [], excludes = [], extens
2020

2121
# Load the datasets
2222
all_gdf = []
23+
custom_schemas = {}
2324
for dataset in datasets:
2425
# Load the dataset
2526
schema = load_parquet_schema(dataset)
27+
collection = parse_metadata(schema, b"fiboa")
2628
file_columns = list(set(columns) & set(schema.names))
2729
gdf = load_parquet_data(dataset, columns=file_columns)
2830

2931
# Change the CRS if necessary
3032
gdf.to_crs(crs=crs, inplace=True)
3133

3234
# Add collection column to each dataset
33-
collection = parse_metadata(schema, b"fiboa")
3435
if collection is not None and "id" in collection:
3536
gdf["collection"] = collection["id"]
3637
else:
3738
gdf["collection"] = os.path.splitext(os.path.basename(dataset))[0]
3839

40+
# Merge custom schemas
41+
custom_schema = collection.get("fiboa_custom_schemas", {})
42+
custom_schemas = merge_schemas(custom_schemas, custom_schema)
43+
3944
all_gdf.append(gdf)
4045

4146
merged = pd.concat(all_gdf, ignore_index=True)
@@ -51,6 +56,11 @@ def merge(datasets, out, crs = DEFAULT_CRS, includes = [], excludes = [], extens
5156
"fiboa_extensions": extensions,
5257
}
5358

59+
# Add custom schemas
60+
custom_schemas = pick_schemas(custom_schemas, columns)
61+
if len(custom_schemas) > 0:
62+
collection["fiboa_custom_schemas"] = custom_schemas
63+
5464
# Write the merged dataset to the output file
5565
create_parquet(merged, columns, collection, out, {}, compression=compression, geoparquet1=geoparquet1)
5666
log(f"Merged data to {out}", "success")

fiboa_cli/parquet.py

+5
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,11 @@ def create_parquet(data, columns, collection, output_file, config, missing_schem
1515
fiboa_schema = load_fiboa_schema(config)
1616
schemas = merge_schemas(missing_schemas, fiboa_schema)
1717

18+
# Add the custom schemas to the collection for future use
19+
if len(missing_schemas) > 0:
20+
collection = collection.copy()
21+
collection["fiboa_custom_schemas"] = missing_schemas
22+
1823
# Load all extension schemas
1924
extensions = {}
2025
if "fiboa_extensions" in collection and isinstance(collection["fiboa_extensions"], list):

fiboa_cli/util.py

+16
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,22 @@ def merge_schemas(*schemas):
276276
return result
277277

278278

279+
def pick_schemas(schemas, properties, rename=None):
    """Pick and rename schemas for specific properties.

    Builds a new fiboa schema fragment containing only the entries of
    ``schemas`` whose property name appears in ``properties``, optionally
    renaming each picked property via the ``rename`` mapping.

    :param schemas: Schema dict with optional ``required`` (list of names)
        and ``properties`` (name -> schema) keys. May be empty, e.g. when
        a collection has no ``fiboa_custom_schemas`` entry.
    :param properties: Iterable of property names to keep.
    :param rename: Optional mapping of old property name -> new name.
    :return: New dict with ``required`` and ``properties`` keys containing
        only the picked (and possibly renamed) entries.
    """
    # None sentinel instead of a mutable `{}` default, which would be
    # shared across calls.
    if rename is None:
        rename = {}
    # Tolerate partial/empty input: callers pass
    # collection.get("fiboa_custom_schemas", {}), which may lack both keys.
    required = schemas.get("required", [])
    available = schemas.get("properties", {})
    result = {
        "required": [],
        "properties": {}
    }
    for prop in properties:
        target = rename.get(prop, prop)
        if prop in required:
            result["required"].append(target)
        if prop in available:
            result["properties"][target] = available[prop]

    return result
293+
294+
279295
def migrate_schema(schema):
    """Migrate a schema to the current version.

    Currently a no-op that returns a shallow copy of the given schema,
    so callers can mutate the result without affecting the original.
    """
    migrated = schema.copy()
    return migrated

fiboa_cli/validate.py

+4
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,10 @@ def validate_parquet(file, config):
207207
for ext in extensions.values():
208208
schemas = merge_schemas(schemas, ext)
209209

210+
# Add custom schemas
211+
custom_schemas = collection.get("fiboa_custom_schemas", {})
212+
schemas = merge_schemas(schemas, custom_schemas)
213+
210214
# Check that all required fields are present
211215
for key in schemas.get("required", []):
212216
if key not in parquet_schema.names:

0 commit comments

Comments
 (0)