HomeDocs-Technical WhitePaper43-EFT.WP.Data.DatasetCards v1.0

Chapter 18 Appendix: Dataset Card Template


I. Template Scope & Posture

Provide two drop-in templates—minimal and full skeleton—for YAML/JSON dataset cards. Keys use snake_case; citations use “Volume vX.Y:Anchor”; the unit system follows SI with check_dim=true.

II. Minimal Template (copy-paste ready)

# ===== Minimal Dataset Card (release-grade) =====
# Smallest card layout: identity header, provenance, splits, metrology, one
# quality gate, checksums, and the export manifest.
# Values written as <...> are placeholders.

dataset_id: "<org.project.dataset>"
title: "<Human-readable Title>"
version: "v1.0"
summary: "<100–300 chars brief purpose, coverage, limitations>"
modality: ["time_series"]  # radio|optical|image|time_series|text|tabular
sources: ["<doi:...>", "<dataset_id@vX.Y>"]
license: "<SPDX or policy>"
access: "open"  # open|restricted|closed

provenance:
  collection_method: "<simulation|survey|beamformed-array|...>"
  time_coverage: "<YYYY-MM-DD..YYYY-MM-DD>"
  # spatial_coverage / selection_bias optional

splits:
  train: {count: 0, ratio: 0.8}
  validation: {count: 0, ratio: 0.1}
  test: {count: 0, ratio: 0.1}
  policy: {leakage_guard: ["per-object"], freeze_indices: true}

metrology:
  units: "SI"
  c_ref: 299792458
  angle_unit: "deg"
  time_standard: "UTC"
  check_dim: true

quality:
  gates:
    - {name: "leakage", metric: "leakage_rate", threshold: 0.0}

# NOTE(review): kept as a top-level key; confirm it is not meant to nest
# under `quality`.
checksums: {}

# Nested so that this `version` no longer collides with the card-level
# `version` above.
export_manifest:
  version: "v1.0"
  artifacts: []
  references:
    - "EFT.WP.Core.DataSpec v1.0:EXPORT"
    - "EFT.WP.Core.Metrology v1.0:check_dim"

(References use “Volume vX.Y:Anchor” consistent with the export manifest.)


III. Full Skeleton Template (release-grade, with optional extensions)

# ===== Full Dataset Card Skeleton =====
# Identity and access header of the card; values written as <...> are
# placeholders.
dataset_id: "<org.project.dataset>"
title: "<Human-readable Title>"
version: "v1.0.0"
summary: "<purpose, scope, coverage, limitations (100–300 chars)>"
modality:
  - "radio"
  - "time_series"
sources:
  - "<doi:...>"
  - "<eift.obs.base@v1.2>"
license: "<SPDX or policy>"
access: "open"  # open|restricted|closed

# Provenance: collection method, instruments, time/spatial coverage,
# selection bias, and permits.
provenance:
  collection_method: "<beamformed-array|drift-scan|survey-aggregation|simulation>"
  instruments:
    - {name: "<string>", station: "<string>", role: "rx"}
  time_coverage: "<YYYY-MM-DD..YYYY-MM-DD>"
  spatial_coverage: "<RA/Dec ranges | CRS:EPSG:4326>"
  selection_bias: "<flux-limited, SNR>=7>"
  permits: ["<ethics/permit-ref>"]

# Sampling: strategy, strata, per-split rates, seed, replacement flag,
# dedup policy, and the audits listed for this stage.
sampling:
  strategy: "<random|stratified|systematic|time-based|spatial-tiles>"
  strata:
    - by: "class"
      buckets: {"A": 100, "B": 200}
  rates: {train: 0.8, validation: 0.1, test: 0.1}
  seed: 1701
  replacement: false
  dedup_policy: "per-object"
  audits: ["coverage", "leakage", "class-imbalance"]

# Preprocess: pipeline id, ordered steps, and reproducibility settings.
preprocess:
  pipeline_id: "<pipeline-name>"
  steps:
    # Each step is one list item; its fields belong to that item.
    - name: "rfi_clean"
      enabled: true
      idempotent: true
      params: {method: "spectral-kurtosis", window: 256, thr_sigma: 5}
      inputs: ["raw_spec"]
      outputs: ["mask_spec"]
  parameter_lock: true
  # NOTE(review): randomness / environment / audits are placed at the
  # preprocess level by inference from their position; confirm.
  randomness:
    seed: 1701
    libraries: {numpy: "1.26.4"}
  environment:
    os: "ubuntu22.04"
    containers: ["ghcr.io/eift/card-prep:1.0.2"]
  audits: ["nan-check", "range-check", "leakage"]

# Labels: schema version, taxonomy, class map, encoding, labelling policy,
# and multilingual display names.
# NOTE(review): class_map, encoding, policy and multilingual are placed as
# direct children of `labels` by inference; confirm against the schema.
labels:
  schema_version: "v1.0"
  taxonomy:
    root: "event"
    nodes:
      - id: "FRB"
        parent: "event"
        kind: "class"
        definition: "fast radio burst"
      - id: "RFI"
        parent: "event"
        kind: "artifact"
        definition: "radio frequency interference"
  class_map: {include: ["FRB", "RFI"], exclude: []}
  encoding:
    type: "multi_class"  # multi_class|multi_label|hierarchical
  policy:
    positive_rules: ["explicit-evidence"]
    negative_rules: ["contradiction-or-missing-signal"]
    tie_breaker: "lowest-risk"
  multilingual:
    default_lang: "en"
    map:
      FRB: {en: "FRB", zh: "快速射电暴"}
      RFI: {en: "RFI", zh: "射频干扰"}

# Metrology: unit system, reference constant, time standard, angle unit,
# and the dimensional-check switch.
metrology:
  units: "SI"
  c_ref: 299792458
  time_standard: "UTC"
  angle_unit: "deg"
  check_dim: true
  # — Enable this block if path-dependent quantities (e.g., T_arr) are present —
  # NOTE(review): nested under `metrology` by inference from its position;
  # confirm.
  path_dependence:
    applies_to: ["T_arr"]
    delta_form: "const-factor"  # or "general"
    path: "gamma(ell)"
    measure: "d ell"
    see:
      - "EFT.WP.Core.Equations v1.1:S20-1"
      - "EFT.WP.Core.Metrology v1.0:check_dim"

# Uncertainty: model, individual components, correlation groups,
# propagation rule, and coverage policy.
uncertainty:
  model: "GUM"  # GUM|bayesian|montecarlo
  components:
    - name: "thermal"
      type: "random"
      value: 2.1
      unit: "K"
      distribution: "normal"
      coverage: {k: 1.0}
    - name: "cal_gain"
      type: "systematic"
      value: 0.8
      unit: "%"
      distribution: "normal"
      coverage: {k: 2.0}
      corr_group: "instrument"
  correlation:
    posture: "groups"
    groups:
      - {name: "instrument", pairwise: "rho=0.6"}
  propagation: {rule: "linear", linearization: "first-order", samples: 0}
  coverage_policy: {target_p: 0.95, k: 2.0}

# Splits: per-split counts and ratios, the split policy, and split audits.
splits:
  train: {count: 0, ratio: 0.8}
  validation: {count: 0, ratio: 0.1}
  test: {count: 0, ratio: 0.1}
  policy:
    leakage_guard: ["per-object", "per-timewindow"]
    stratify_by: ["class", "region"]
    freeze_indices: true
  # NOTE(review): `audit` is placed under `splits` by inference from its
  # position; confirm.
  audit:
    coverage: {by: "class", report: true}
    leakage: {cross_split: "forbid"}

# Distribution: packaging layout, mirrors, rate limit, and regional
# compliance tags.
distribution:
  packaging:
    format: "tgz"
    shard_bytes: 134217728
    layout: ["train", "validation", "test"]
  mirrors: ["https://mirror-a.example/foo/", "s3://bucket/foo/"]
  rate_limit: {mbps: 50}
  regional_compliance: ["EU-GDPR"]  # if applicable

# Checksums: SHA-256 of the whole package and of each shard.
# NOTE(review): kept as a top-level key; it may instead belong under the
# preceding `distribution` section — confirm.
checksums:
  package: {sha256: "<hex>"}
  shards:
    - {path: "train-000.tgz", sha256: "<hex>"}

# Quality: list of gates, each naming a metric and its threshold.
quality:
  gates:
    - {name: "label_consistency", metric: "kappa", threshold: 0.98}
    - {name: "leakage", metric: "leakage_rate", threshold: 0.0}
    - {name: "coverage_min", metric: "split_coverage", threshold: 0.99}

# Coverage: sample count, per-class counts, and the confidence-interval
# method and target.
# NOTE(review): kept as a top-level key; it may instead belong under the
# preceding `quality` section — confirm.
coverage:
  samples: 0
  per_class: {}
  ci_method: "bootstrap-bca"
  target_ci: 0.95

# Baseline: tasks, metrics, evaluation protocol, and robustness tests.
baseline:
  tasks:
    - {name: "cls_frb_vs_rfi", type: "classification", split: "test"}
  metrics:
    - {name: "f1_macro"}
    - {name: "roc_auc"}
    - {name: "ece"}
    - {name: "brier"}
  eval_protocol:
    splits: "frozen"
    seeds: [0, 1, 2, 3, 4]
    repeats: 5
    ci: {method: "bootstrap-bca", level: 0.95}
    significance: {test: "permutation", alpha: 0.05}
  # NOTE(review): `robustness` is placed as a sibling of `eval_protocol` by
  # inference; confirm it is not meant to nest inside it.
  robustness:
    shift_tests: ["snr_drop", "time_jitter", "spec_notch"]

# Privacy: PII policy, lawful basis, and data-minimization flag.
privacy:
  policy: "no-PII"  # no-PII|limited-PII|special-category
  lawful_basis: ["research"]
  data_minimization: true

# Ethics: intended and prohibited uses of the dataset.
ethics:
  intended_use: ["academic", "benchmark"]
  prohibited_use: ["surveillance"]

# Release: channel, version, date, and compatibility statement.
# `version` is nested here so it does not collide with the card-level
# `version` key.
release:
  channel: "stable"  # alpha|beta|rc|stable|yanked
  version: "v1.0.0"
  date: "2025-09-20"
  compatibility: {baseline: "v1.*", backwards: "minor"}

# Export manifest: manifest version, artifacts with SHA-256, and cited
# references in "Volume vX.Y:Anchor" form.
export_manifest:
  version: "v1.0"
  artifacts:
    - {path: "splits/train.index", sha256: "..."}
    - {path: "quality/summary.csv", sha256: "..."}
  references:
    - "EFT.WP.Core.DataSpec v1.0:EXPORT"
    - "EFT.WP.Core.Metrology v1.0:check_dim"
    - "EFT.WP.Core.Equations v1.1:S20-1"

(SI metrology and the citation-anchor format are mandatory.)


IV. Placeholder Hints & Minimal Regex (quick reference)


V. Export Manifest Template (normative)

# Export manifest template: manifest version, artifacts with SHA-256, and
# cited references in "Volume vX.Y:Anchor" form.
export_manifest:
  version: "v1.0"
  artifacts:
    - {path: "dataset_card.yaml", sha256: "<hex>"}
    - {path: "splits/train.index", sha256: "<hex>"}
    - {path: "packages/train-000.tgz", sha256: "<hex>"}
  references:
    - "EFT.WP.Core.DataSpec v1.0:EXPORT"
    - "EFT.WP.Core.Metrology v1.0:check_dim"

(All artifacts must be verifiable; references carry “Volume+Version+Anchor”.)


VI. Pre-Release Blocking Self-Check (list)


VII. Machine-Readable Blank Template (no comments; CI-friendly)

dataset_id: ""
title: ""
version: "v1.0"
summary: ""
modality: []
sources: []
license: ""
access: "open"
provenance: {collection_method: "", time_coverage: ""}
sampling: {}
preprocess: {pipeline_id: "", steps: [], parameter_lock: true}
labels: {}
metrology:
  units: "SI"
  c_ref: 299792458
  time_standard: "UTC"
  angle_unit: "deg"
  check_dim: true
uncertainty: {}
splits:
  train: {count: 0, ratio: 0.8}
  validation: {count: 0, ratio: 0.1}
  test: {count: 0, ratio: 0.1}
  policy: {leakage_guard: ["per-object"], freeze_indices: true}
distribution: {}
quality: {}
privacy: {}
ethics: {}
release:
  channel: "stable"
  version: "v1.0.0"
  date: "2025-09-20"
  compatibility: {baseline: "v1.*", backwards: "minor"}
export_manifest:
  version: "v1.0"
  artifacts: []
  references:
    - "EFT.WP.Core.DataSpec v1.0:EXPORT"
    - "EFT.WP.Core.Metrology v1.0:check_dim"

(Ready for automation; fill incrementally as needed.)


Copyright & License (CC BY 4.0)

Copyright: Unless otherwise noted, the copyright of “Energy Filament Theory” (text, charts, illustrations, symbols, and formulas) belongs to the author “Guanglin Tu”.
License: This work is licensed under the Creative Commons Attribution 4.0 International (CC BY 4.0). You may copy, redistribute, excerpt, adapt, and share for commercial or non‑commercial purposes with proper attribution.
Suggested attribution: Author: “Guanglin Tu”; Work: “Energy Filament Theory”; Source: energyfilament.org; License: CC BY 4.0.

First published: 2025-11-11 | Current version: v5.1
License link: https://creativecommons.org/licenses/by/4.0/