-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcioos_ocads_xml_updater.py
154 lines (139 loc) · 6.01 KB
/
cioos_ocads_xml_updater.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import sys
import yaml
from lxml import etree
from collections import defaultdict
from metadata_xml.metadata_xml.template_functions import metadata_to_xml
# Prefix -> namespace URI map handed to every find()/findall()/xpath() call
# in this module so the "gmd:", "gco:", ... prefixes in the paths resolve.
namespaces = dict(
    gmi="http://www.isotc211.org/2005/gmi",
    gmd="http://www.isotc211.org/2005/gmd",
    gco="http://www.isotc211.org/2005/gco",
    gml="http://www.opengis.net/gml",
    xlink="http://www.w3.org/1999/xlink",
)
def extract_text(el):
    """Return the whitespace-stripped text of ``el``, or ``None``.

    ``None`` is returned when ``el`` itself is ``None`` and when its ``text``
    attribute is ``None`` or empty. Text made only of whitespace is not
    treated as missing: it comes back as an empty string.
    """
    if el is None:
        return None
    raw = el.text
    if not raw:
        return None
    return raw.strip()
def extract_bbox(root):
    """Return the first geographic bounding box found under ``root``.

    Searches for the first ``gmd:EX_GeographicBoundingBox`` below
    ``gmd:identificationInfo/*/gmd:extent`` and reads its four
    ``gco:Decimal`` bounds.

    Args:
        root: Element to search from.

    Returns:
        ``[west, south, east, north]`` as floats, or ``None`` when no bounding
        box element exists or any of the four bounds is missing or is not a
        valid number.
    """
    bbox = root.find(
        ".//gmd:identificationInfo/*/gmd:extent//gmd:EX_GeographicBoundingBox",
        namespaces,
    )
    if bbox is None:
        return None

    # Order matters: callers receive [west, south, east, north].
    bound_paths = (
        "gmd:westBoundLongitude/gco:Decimal",
        "gmd:southBoundLatitude/gco:Decimal",
        "gmd:eastBoundLongitude/gco:Decimal",
        "gmd:northBoundLatitude/gco:Decimal",
    )
    coords = []
    for path in bound_paths:
        text = extract_text(bbox.find(path, namespaces))
        try:
            coords.append(float(text))
        except (TypeError, ValueError):
            # TypeError: the bound is absent (text is None).
            # ValueError: the bound is present but not numeric.
            # An incomplete box is reported the same way as a missing one.
            return None
    return coords
def extract_keywords(root):
    """Collect keywords under ``root``, grouped by thesaurus category.

    Each ``gmd:descriptiveKeywords/gmd:MD_Keywords`` block is assigned a
    category from its thesaurus title (case-insensitive substring match):

    * ``"eov"``     - title contains "eov"
    * ``"taxa"``    - title contains "taxa" or "taxonomic"
    * ``"default"`` - anything else, including a missing or empty title

    Args:
        root: Element to search from.

    Returns:
        A plain dict mapping each category that occurred to a sorted list of
        its unique, non-empty keywords. A category whose blocks held no
        usable keywords maps to an empty list.
    """
    grouped = defaultdict(list)
    for block in root.findall(".//gmd:descriptiveKeywords/gmd:MD_Keywords", namespaces):
        title_el = block.find(
            ".//gmd:thesaurusName/gmd:CI_Citation/gmd:title/gco:CharacterString",
            namespaces,
        )
        # The title element may exist but carry no text, in which case
        # extract_text() returns None; fall back to "default" instead of
        # calling .lower() on None.
        title = (extract_text(title_el) or "default").lower()

        keyword_els = block.findall(".//gmd:keyword/gco:CharacterString", namespaces)
        # Extract each keyword once and drop missing/empty ones.
        words = [word for word in map(extract_text, keyword_els) if word]

        if "eov" in title:
            category = "eov"
        elif "taxa" in title or "taxonomic" in title:
            category = "taxa"
        else:
            category = "default"
        grouped[category].extend(words)

    return {category: sorted(set(words)) for category, words in grouped.items()}
def extract_contacts(root):
    """Build a list of contact dicts from the responsible parties in ``root``.

    Every ``gmd:CI_ResponsibleParty`` that is a direct child of a
    ``gmd:contact`` element yields one entry.

    Args:
        root: Element to search from.

    Returns:
        A list of dicts shaped like
        ``{"roles": [role], "organization": {"name": ..., "email": ...}}``.
        ``roles`` is empty when no role can be determined; ``name`` and
        ``email`` are ``None`` when the corresponding element is missing.
    """
    contacts = []
    for party in root.findall(".//gmd:contact/gmd:CI_ResponsibleParty", namespaces):
        role_el = party.find(".//gmd:role/gmd:CI_RoleCode", namespaces)
        role = extract_text(role_el)
        if not role and role_el is not None:
            # Codelist elements may carry their value only in the
            # codeListValue attribute, with no element text.
            role = (role_el.get("codeListValue") or "").strip()

        org_name = extract_text(
            party.find(".//gmd:organisationName/gco:CharacterString", namespaces)
        )
        email = extract_text(
            party.find(
                ".//gmd:contactInfo//gmd:address//gmd:electronicMailAddress/gco:CharacterString",
                namespaces,
            )
        )
        contacts.append(
            {
                "roles": [role] if role else [],
                "organization": {"name": org_name, "email": email},
            }
        )
    return contacts
def parse_xml_to_record(xml_path):
    """Parse an ISO metadata XML file into a metadata record dict.

    Only a handful of fields are read from the XML: the file identifier, the
    citation title, the abstract, the bounding box, keywords, contacts and
    online-resource URLs. Every other value in the returned record is a
    hard-coded constant written out below.

    Args:
        xml_path: Path (or other source accepted by ``etree.parse``) of the
            XML document to read.

    Returns:
        A nested dict with the top-level keys ``metadata``, ``spatial``,
        ``identification``, ``contact`` and ``distribution``.
    """
    root = etree.parse(xml_path).getroot()

    identifier = extract_text(root.find(".//gmd:fileIdentifier/gco:CharacterString", namespaces))
    title = extract_text(
        root.find(
            ".//gmd:identificationInfo/*/gmd:citation/gmd:CI_Citation/gmd:title/gco:CharacterString",
            namespaces,
        )
    )
    abstract = extract_text(
        root.find(".//gmd:identificationInfo/*/gmd:abstract/gco:CharacterString", namespaces)
    )

    # The French values are placeholders made by suffixing the English text.
    # When the English text is missing or empty, mirror it rather than
    # emitting the literal string "None (FR)".
    title_fr = f"{title} (FR)" if title else title
    abstract_fr = f"{abstract} (FR)" if abstract else abstract

    # lxml returns XPath text() results as str subclasses ("smart strings").
    # Convert them to plain, stripped str so downstream serializers see
    # ordinary strings, and skip whitespace-only nodes.
    raw_urls = root.xpath(
        ".//gmd:onLine/gmd:CI_OnlineResource/gmd:linkage/gmd:URL/text()",
        namespaces=namespaces,
    )
    distribution = [{"url": str(u).strip()} for u in raw_urls if u.strip()]

    record = {
        "metadata": {
            "naming_authority": "ca.cioos",
            "identifier": identifier,
            # The record language is fixed; the source document's language
            # code is not consulted.
            "language": "en",
            "maintenance_note": "auto-generated",
            "use_constraints": {
                "limitations": {
                    "en": "limitations in english",
                    "fr": "limitations in french",
                    "translations": {"fr": {"validated": False, "message": "Auto-translated using AWS"}},
                },
                "licence": {
                    "title": "Creative Commons Attribution 4.0",
                    "code": "CC-BY-4.0",
                    "url": "https://creativecommons.org/licenses/by/4.0/",
                },
            },
            "comment": {
                "en": "auto-generated comment",
                "fr": "auto-generated comment",
            },
            "dates": {
                "creation": "2024-01-01",
            },
            "scope": "model",
        },
        "spatial": {
            "bbox": extract_bbox(root),
            "polygon": "polygon_data",
            "description": "description of the spatial extent or study area of the dataset",
            "descriptionIdentifier": "A2345-FS323-DG434-345DG",
            "vertical": [0, 10],
            "vertical_positive": "down",
        },
        "identification": {
            "title": {
                "en": title,
                "fr": title_fr,
                "translations": {"fr": {"validated": False, "message": "Auto-translated using AWS"}},
            },
            "abstract": {
                "en": abstract,
                "fr": abstract_fr,
                "translations": {"fr": {"validated": False, "message": "Auto-translated using AWS"}},
            },
            "keywords": extract_keywords(root),
            "temporal_begin": "1950-07-31",
            "temporal_end": "now",
            "temporal_duration": "P1D",
            "time_coverage_resolution": "P1D",
            "acknowledgement": "acknowledgement",
            "status": "onGoing",
            "project": {
                "en": ["project_a", "project_b"],
                "fr": ["project_a in french", "project_b in french"],
            },
        },
        "contact": extract_contacts(root),
        "distribution": distribution,
    }
    return record
if __name__ == "__main__":
    # Command-line entry point: read one XML file, write the extracted record
    # as YAML and the regenerated XML next to it.
    import os  # only the entry point needs it

    if len(sys.argv) != 3:
        print("Usage: python script.py input.xml output.yaml")
        sys.exit(1)
    input_xml = sys.argv[1]
    output_yaml = sys.argv[2]

    record = parse_xml_to_record(input_xml)
    xml = metadata_to_xml(record)

    # Save YAML. The encoding is explicit because allow_unicode=True writes
    # non-ASCII characters verbatim.
    with open(output_yaml, "w", encoding="utf-8") as f:
        yaml.dump(record, f, allow_unicode=True, sort_keys=False)

    # Save XML beside the YAML file. Only the final extension is swapped, so
    # names such as "out.yml" or a directory containing ".yaml" are handled,
    # and the XML can never land on the YAML file that was just written.
    stem, _ext = os.path.splitext(output_yaml)
    output_xml = stem + ".xml"
    if output_xml == output_yaml:
        output_xml = output_yaml + ".xml"
    with open(output_xml, "w", encoding="utf-8") as f:
        f.write(xml)

    print(f"YAML metadata written to {output_yaml}")
    print(f"ISO XML metadata written to {output_xml}")