File size: 6,870 Bytes
eee232c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e08f1d
eee232c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# openenv.yaml
# ─────────────────────────────────────────────────────────────────────────────
# Manifest for the Data Cleaning Pipeline OpenEnv environment.
#
# Field reference
# ───────────────
# Required by the CLI (serve / build / push / validate):
#   spec_version  — always 1 for this generation of the spec
#   name          — environment identifier used by the CLI and auto-discovery
#   type          — "space" means it can be deployed as a Hugging Face Space
#   runtime       — "fastapi" tells the server how to boot
#   app           — Python import path to the FastAPI app object
#   port          — port the server listens on inside the container
#
# Read by AutoEnv auto-discovery (openenv.auto._discovery):
#   name          — maps to env_key after stripping the "_env" suffix
#   description   — human-readable label shown in env listings
#   spec_version  — stored in EnvironmentInfo for introspection
#   action        — EXPLICIT override of the auto-inferred class name
#   observation   — EXPLICIT override of the auto-inferred class name
#
# NOTE on action / observation overrides:
#   Auto-discovery infers class names from the env name using PascalCase:
#     "data_cleaning_env" → base "data_cleaning" → "DataCleaningAction"
#   Our actual class is named "CleanAction" (not "DataCleaningAction"),
#   so these fields MUST be set to avoid ImportError on AutoEnv.from_env().
#
# All other fields (tasks, reward, tags) are informational.  They are not
# parsed by the current OpenEnv tooling but are preserved in
# EnvironmentInfo.manifest and available to the web UI and external tools.
# ─────────────────────────────────────────────────────────────────────────────

# ── Core deployment fields ────────────────────────────────────────────────────

# Fields the OpenEnv CLI requires. String values are quoted so their type is
# explicit; the parsed values are unchanged.
spec_version: 1            # integer spec generation
name: "data_cleaning_env"  # environment identifier
type: "space"
runtime: "fastapi"
app: "server.app:app"      # import path to the FastAPI app object
# NOTE(review): Hugging Face Docker Spaces default to port 7860. Confirm that
# 7680 is intentional and matches the Dockerfile / Space app_port setting.
port: 7680

# ── Package metadata ──────────────────────────────────────────────────────────

# Package version, quoted so it is unambiguously a string.
version: "1.0.0"

# Human-readable summary of the environment. Folded scalar (>-): single line
# breaks become spaces and the trailing newline is stripped.
description: >-
  Data cleaning pipeline: the agent receives a dirty CSV and must detect
  and fix type errors, missing values, outliers, and schema inconsistencies
  to match a hidden ground-truth dataset. Three tasks (easy → medium → hard)
  with a deterministic grader that returns a continuous score in [0.0, 1.0].

# ── Auto-discovery class overrides ───────────────────────────────────────────
# These override auto-inferred names (which would be DataCleaningAction /
# DataCleaningObservation) to match the actual class names defined in models.py.

action: "CleanAction"            # explicit action class name
observation: "CleanObservation"  # explicit observation class name

# The client class is correctly inferred as DataCleaningEnv (data_cleaning →
# DataCleaning + Env), which matches client.py, so no override is needed.

# ── Tags (informational) ──────────────────────────────────────────────────────

# Free-form labels, written in flow style because this is a short leaf list.
tags: [data-cleaning, tabular, real-world, hackathon]

# ── Task manifest (informational) ─────────────────────────────────────────────
# One entry per task. These values mirror the constants in models.py
# (MAX_STEPS, DONE_THRESHOLD) and the descriptions in dataset_factory.py.

tasks:
  # Easy task: cell-level repairs on a small sales table.
  - id: "easy"
    name: "Fix obvious errors"
    # Folded scalar: each line break below folds into a single space.
    description: >-
      50-row sales CSV with 29 injected dirty cells:
      10 type mismatches (text in numeric columns),
      8 missing values,
      5 far-future dates (year 2099),
      and 6 cells with leading/trailing whitespace.
      Graded by exact cell-level match against the ground truth (0.0–1.0).
    dataset_rows: 50
    dirty_cells: 29  # 10 + 8 + 5 + 6, as itemised in the description
    max_steps: 40
    done_threshold: 0.95

  # Medium task: outlier handling and category typo correction.
  - id: "medium"
    name: "Outlier detection without false positives"
    # Folded scalar: each line break below folds into a single space.
    description: >-
      200-row customer transaction CSV with
      15 true statistical outliers (negative or > $2000 amounts)
      that must be fixed or removed,
      5 valid large transactions ($900–$2000) that must NOT be removed,
      and 12 category spelling typos.
      Graded by F1 score on outlier detection (0.5 weight)
      and typo correction rate (0.5 weight).
    dataset_rows: 200
    dirty_cells: 27  # 15 outliers + 12 typos, as itemised in the description
    max_steps: 80
    done_threshold: 0.85

  # Hard task: schema unification, date normalisation and deduplication.
  - id: "hard"
    name: "Multi-source schema normalisation and deduplication"
    # Folded scalar: each line break below folds into a single space.
    description: >-
      430-row CSV (400 clean + 30 duplicates) merged from
      3 fictional data sources with inconsistent column naming
      (e.g. cust_id / customer_id / CustomerID),
      mixed date formats (ISO, US, EU),
      and ~30 duplicate rows (exact and near-duplicate).
      Agent must infer the canonical 9-column schema
      without explicit documentation.
      Graded by schema match (40%), deduplication F1 (35%),
      and date format compliance (25%).
    dataset_rows: 430  # canonical_rows + duplicate_rows
    canonical_rows: 400
    canonical_columns: 9
    duplicate_rows: 30
    max_steps: 150
    done_threshold: 0.80

# ── Reward function summary (informational) ───────────────────────────────────

# Informational summary of the reward signal. Mis-encoded characters in the
# text values (minus sign, em dash, multiplication sign) have been repaired.
reward:
  type: dense
  range: [-0.5, 1.0]  # presumably [min, max] per-step reward — confirm
  step_cost: -0.005   # same value as the step_cost component's weight
  components:
    # Main signal: change in grader score between consecutive steps.
    - name: progress
      weight: primary
      description: >-
        Grader score delta each step (curr_score − prev_score).
        The main learning signal — any cell fixed produces a non-zero reward.

    # One-off bonus on the solving step, per its description.
    - name: efficiency_bonus
      weight: "+0.10 × (1 − step_fraction)"
      description: >-
        Small bonus awarded the step the episode is solved (score crosses
        done_threshold). Rewards finishing early relative to the step budget.

    # Penalty for deleting a row that was extreme but valid (medium task).
    - name: "false_positive_penalty"
      weight: -0.15
      description: >-
        Applied when DROP_ROW removes a valid-extreme row
        in the medium task. Penalises aggressive deletion
        without checking schema_hint.

    # Penalty for ending the episode while the score is still low.
    - name: "early_done_penalty"
      weight: -0.20
      description: >-
        Applied when the agent sends DONE with
        current_score < 0.60. Discourages giving up prematurely.

    # Constant per-step charge.
    - name: "step_cost"
      weight: -0.005
      description: >-
        Fixed cost every step regardless of outcome. Prevents
        infinite loops and padding.