mirror of
https://github.com/SamyRai/turash.git
synced 2025-12-26 23:01:33 +00:00
- Initialize git repository - Add comprehensive .gitignore for Go projects - Install golangci-lint v2.6.0 (latest v2) globally - Configure .golangci.yml with appropriate linters and formatters - Fix all formatting issues (gofmt) - Fix all errcheck issues (unchecked errors) - Adjust complexity threshold for validation functions - All checks passing: build, test, vet, lint
342 lines
14 KiB
Markdown
342 lines
14 KiB
Markdown
## 4. Data Model (Schema / Ontology)
|
|
|
|
The platform uses a structured data model to represent industrial resources, businesses, sites, and relationships. Each resource is captured with *what*, *how much*, *quality*, *when*, and *where* attributes.
|
|
|
|
### Core Schemas
|
|
|
|
**ResourceFlow** - Represents resource inputs, outputs, and services at specific sites:
|
|
*See [schemas/resource_flow.json](./schemas/resource_flow.json) for complete schema definition*
|
|
|
|
**Site** - Represents physical locations and buildings where business activities occur:
|
|
*See [schemas/site.json](./schemas/site.json) for complete schema definition*
|
|
|
|
**Business** - Represents legal/commercial entities and their operational capabilities:
|
|
*See [schemas/business.json](./schemas/business.json) for complete schema definition*
|
|
|
|
**SharedAsset** - Equipment and infrastructure at specific sites that can be shared among businesses:
|
|
*See [schemas/shared_asset.json](./schemas/shared_asset.json) for complete schema definition*
|
|
|
|
Businesses then just publish `ResourceFlow` objects to the system.
|
|
|
|
### Neo4j Graph Database Schema
|
|
|
|
#### Node Schemas
|
|
```cypher
|
|
// ---------------------------------------------------------------------------
// Uniqueness constraints: one per identifying property.
// ---------------------------------------------------------------------------

// Business: both the id and the email must be unique among Business nodes.
CREATE CONSTRAINT business_id_unique IF NOT EXISTS
  FOR (business:Business)
  REQUIRE business.id IS UNIQUE;

CREATE CONSTRAINT business_email_unique IF NOT EXISTS
  FOR (business:Business)
  REQUIRE business.email IS UNIQUE;

// Site: unique id.
CREATE CONSTRAINT site_id_unique IF NOT EXISTS
  FOR (site:Site)
  REQUIRE site.id IS UNIQUE;

// ResourceFlow: unique id.
CREATE CONSTRAINT resource_flow_id_unique IF NOT EXISTS
  FOR (flow:ResourceFlow)
  REQUIRE flow.id IS UNIQUE;

// ---------------------------------------------------------------------------
// Lookup indexes for performance.
// ---------------------------------------------------------------------------

// Business lookup by name.
CREATE INDEX business_name_index IF NOT EXISTS
  FOR (business:Business)
  ON (business.name);

// Composite index over the two coordinate properties of a Site.
CREATE INDEX site_location_index IF NOT EXISTS
  FOR (site:Site)
  ON (site.latitude, site.longitude);

// Composite index over flow type and direction.
CREATE INDEX resource_flow_type_direction_index IF NOT EXISTS
  FOR (flow:ResourceFlow)
  ON (flow.type, flow.direction);

// Single-property index on the flow temperature.
CREATE INDEX resource_flow_quality_temp_index IF NOT EXISTS
  FOR (flow:ResourceFlow)
  ON (flow.temperature_celsius);
|
|
```
|
|
|
|
#### Relationship Schemas
|
|
```cypher
|
|
// Core Relationships
//
// Neo4j has no DDL that declares which node labels a relationship type may
// connect, so the allowed patterns are catalogued here as documentation and
// have to be respected by whatever code writes relationships.
//
// NOTE: these patterns must NOT be passed to apoc.schema.assert(). Its two map
// arguments describe indexes and uniqueness constraints keyed by node label
// (label -> list of property keys), not relationship patterns. With its
// default dropExisting = true it also drops every existing index and
// constraint that is not listed in those maps, which would remove the schema
// created under "Node Schemas".
//
//   (:Business)-[:OPERATES_AT]->(:Site)
//   (:Site)-[:HOSTS]->(:ResourceFlow)
//   (:ResourceFlow)-[:MATCHABLE_TO]->(:ResourceFlow)
//   (:Business)-[:OFFERS]->(:Service)
//   (:Business)-[:SELLS]->(:Product)
//   (:Site)-[:HOSTS]->(:SharedAsset)
//   (:Business)-[:TRUSTS]->(:Business)
|
|
```
|
|
|
|
#### Node Property Constraints
|
|
```cypher
|
|
// Required node properties.
//
// A property existence constraint (IS NOT NULL) accepts exactly one property,
// so each required property gets its own constraint instead of a
// parenthesised property list. Property existence constraints are a
// Neo4j Enterprise Edition feature.

// Business Node Properties
CREATE CONSTRAINT business_id_required IF NOT EXISTS
FOR (b:Business) REQUIRE b.id IS NOT NULL;

CREATE CONSTRAINT business_name_required IF NOT EXISTS
FOR (b:Business) REQUIRE b.name IS NOT NULL;

CREATE CONSTRAINT business_email_required IF NOT EXISTS
FOR (b:Business) REQUIRE b.email IS NOT NULL;

// Site Node Properties
CREATE CONSTRAINT site_id_required IF NOT EXISTS
FOR (s:Site) REQUIRE s.id IS NOT NULL;

CREATE CONSTRAINT site_name_required IF NOT EXISTS
FOR (s:Site) REQUIRE s.name IS NOT NULL;

CREATE CONSTRAINT site_latitude_required IF NOT EXISTS
FOR (s:Site) REQUIRE s.latitude IS NOT NULL;

CREATE CONSTRAINT site_longitude_required IF NOT EXISTS
FOR (s:Site) REQUIRE s.longitude IS NOT NULL;

// ResourceFlow Node Properties
CREATE CONSTRAINT resource_flow_id_required IF NOT EXISTS
FOR (rf:ResourceFlow) REQUIRE rf.id IS NOT NULL;

CREATE CONSTRAINT resource_flow_business_id_required IF NOT EXISTS
FOR (rf:ResourceFlow) REQUIRE rf.business_id IS NOT NULL;

CREATE CONSTRAINT resource_flow_site_id_required IF NOT EXISTS
FOR (rf:ResourceFlow) REQUIRE rf.site_id IS NOT NULL;

CREATE CONSTRAINT resource_flow_direction_required IF NOT EXISTS
FOR (rf:ResourceFlow) REQUIRE rf.direction IS NOT NULL;

CREATE CONSTRAINT resource_flow_type_required IF NOT EXISTS
FOR (rf:ResourceFlow) REQUIRE rf.type IS NOT NULL;
|
|
```
|
|
|
|
### PostgreSQL Spatial Database Schema
|
|
|
|
#### Core Tables
|
|
```sql
|
|
-- Extensions the schema below relies on (both statements are idempotent):
--   uuid-ossp : provides uuid_generate_v4(), used as the primary-key default
--   postgis   : provides the GEOGRAPHY column type used by sites.location
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
CREATE EXTENSION IF NOT EXISTS postgis;
|
|
|
|
-- Business table: one row per business entity.
--
-- Columns that carry a DEFAULT are also NOT NULL so an explicit NULL cannot
-- bypass the default (e.g. a NULL certifications value instead of '[]').
-- NOTE(review): the graph schema enforces a unique Business.email; if that
-- property is fed from primary_contact_email, consider a UNIQUE constraint
-- here as well -- confirm the mapping first.
CREATE TABLE IF NOT EXISTS businesses (
    id                    UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    name                  VARCHAR(255) NOT NULL,
    legal_form            VARCHAR(50),
    primary_contact_email VARCHAR(255) NOT NULL,
    primary_contact_phone VARCHAR(50),
    industrial_sector     VARCHAR(10), -- NACE code
    company_size          INTEGER CHECK (company_size >= 0),
    years_operation       INTEGER CHECK (years_operation >= 0),
    supply_chain_role     VARCHAR(50),
    certifications        JSONB NOT NULL DEFAULT '[]',
    business_focus        JSONB NOT NULL DEFAULT '[]',
    strategic_vision      TEXT,
    drivers_barriers      TEXT,
    readiness_maturity    INTEGER CHECK (readiness_maturity BETWEEN 1 AND 5),
    created_at            TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
    updated_at            TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
);
|
|
|
|
-- Site table: one row per physical location.
--
-- The GIST index on the spatial column is created separately by
-- idx_sites_location in the index section, not by this statement.
-- NOTE(review): location is stored independently of latitude/longitude and
-- nothing here keeps them in sync; writers must set all three consistently.
CREATE TABLE IF NOT EXISTS sites (
    id                    UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    name                  VARCHAR(255) NOT NULL,
    address               TEXT,
    -- The DECIMAL precisions alone would admit out-of-range coordinates
    -- (up to 99.x / 999.x), so the valid WGS 84 ranges are checked explicitly.
    latitude              DECIMAL(10,8) NOT NULL CHECK (latitude BETWEEN -90 AND 90),
    longitude             DECIMAL(11,8) NOT NULL CHECK (longitude BETWEEN -180 AND 180),
    location              GEOGRAPHY(POINT, 4326), -- PostGIS spatial column
    site_type             VARCHAR(50),
    floor_area_m2         DECIMAL(12,2) CHECK (floor_area_m2 >= 0),
    ownership             VARCHAR(50),
    owner_business_id     UUID REFERENCES businesses(id),
    available_utilities   JSONB NOT NULL DEFAULT '[]',
    parking_spaces        INTEGER CHECK (parking_spaces >= 0),
    loading_docks         INTEGER CHECK (loading_docks >= 0),
    crane_capacity_tonnes DECIMAL(8,2) CHECK (crane_capacity_tonnes >= 0),
    energy_rating         VARCHAR(100),
    waste_management      JSONB NOT NULL DEFAULT '[]',
    environmental_impact  TEXT,
    created_at            TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
    updated_at            TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
);
|
|
|
|
-- Resource flows table: one row per input or output flow, tied to the
-- business that declares it and the site where it occurs.
--
-- Columns with a DEFAULT are NOT NULL so the default cannot be bypassed with
-- an explicit NULL; this matters for precision_level and source_type, whose
-- CHECK constraints would otherwise pass on NULL.
CREATE TABLE IF NOT EXISTS resource_flows (
    id                    UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    business_id           UUID NOT NULL REFERENCES businesses(id),
    site_id               UUID NOT NULL REFERENCES sites(id),
    direction             VARCHAR(10) NOT NULL CHECK (direction IN ('input', 'output')),
    type                  VARCHAR(50) NOT NULL,

    -- Quality parameters
    temperature_celsius   DECIMAL(6,2) CHECK (temperature_celsius >= -273.15), -- not below absolute zero
    pressure_bar          DECIMAL(8,2),
    purity_pct            DECIMAL(5,2) CHECK (purity_pct BETWEEN 0 AND 100),   -- percentage
    grade                 VARCHAR(100),
    hazardousness         BOOLEAN NOT NULL DEFAULT FALSE,
    composition           TEXT,
    physical_state        VARCHAR(20) CHECK (physical_state IN ('solid', 'liquid', 'gas')),

    -- Quantity parameters
    amount                DECIMAL(15,4) CHECK (amount >= 0),
    unit                  VARCHAR(50),
    temporal_unit         VARCHAR(50),
    variability           DECIMAL(4,3),

    -- Time profile
    availability          JSONB NOT NULL DEFAULT '{}',
    seasonality           JSONB NOT NULL DEFAULT '[]',
    supply_pattern        VARCHAR(50),

    -- Economic data
    cost_in               DECIMAL(10,4), -- €/unit
    cost_out              DECIMAL(10,4), -- €/unit
    waste_disposal_cost   DECIMAL(10,4),
    primary_input_cost    DECIMAL(10,4),
    transportation_cost   DECIMAL(8,4),  -- €/km
    cost_sharing_fraction DECIMAL(3,2) CHECK (cost_sharing_fraction BETWEEN 0 AND 1), -- a fraction, not a percentage

    -- Constraints
    max_distance_km       DECIMAL(8,2) CHECK (max_distance_km >= 0),
    requires_permit       BOOLEAN NOT NULL DEFAULT FALSE,
    min_quality_threshold TEXT,
    regulatory_compliance BOOLEAN NOT NULL DEFAULT TRUE,

    -- Precision level
    precision_level       VARCHAR(20) NOT NULL DEFAULT 'estimated'
        CHECK (precision_level IN ('rough', 'estimated', 'measured')),

    -- Validation
    source_type           VARCHAR(50) NOT NULL DEFAULT 'declared'
        CHECK (source_type IN ('declared', 'device', 'calculated')),
    device_signature      VARCHAR(255), -- For verified device data

    created_at            TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
    updated_at            TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
);
|
|
|
|
-- Versioned resource flows for temporal tracking: one row per recorded
-- version of a resource flow, with the period during which it was valid.
CREATE TABLE IF NOT EXISTS resource_flow_versions (
    id               UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    resource_flow_id UUID NOT NULL REFERENCES resource_flows(id),
    version_number   INTEGER NOT NULL CHECK (version_number > 0),
    changes          JSONB NOT NULL, -- What changed
    changed_by       UUID,           -- User who made change
    change_reason    TEXT,
    valid_from       TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
    valid_to         TIMESTAMP WITH TIME ZONE, -- NULL while the version is still open-ended
    created_at       TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(),
    -- Each flow numbers its versions independently.
    UNIQUE (resource_flow_id, version_number),
    -- Reject inverted or empty validity periods.
    CHECK (valid_to IS NULL OR valid_to > valid_from)
);
|
|
```
|
|
|
|
#### Indexes and Constraints
|
|
```sql
|
|
-- All statements use IF NOT EXISTS so the script can be re-run safely,
-- matching the idempotent CREATE EXTENSION statements above.

-- Spatial index for location-based queries
CREATE INDEX IF NOT EXISTS idx_sites_location ON sites USING GIST (location);

-- Performance indexes
CREATE INDEX IF NOT EXISTS idx_resource_flows_business_site ON resource_flows (business_id, site_id);
CREATE INDEX IF NOT EXISTS idx_resource_flows_type_direction ON resource_flows (type, direction);
CREATE INDEX IF NOT EXISTS idx_resource_flows_temperature ON resource_flows (temperature_celsius);
CREATE INDEX IF NOT EXISTS idx_resource_flows_precision ON resource_flows (precision_level);
CREATE INDEX IF NOT EXISTS idx_resource_flows_updated ON resource_flows (updated_at);

-- Foreign-key columns not led by any index above: the composite
-- (business_id, site_id) index does not serve lookups by site_id alone, and
-- sites.owner_business_id has no index at all. PostgreSQL does not index
-- referencing columns automatically.
CREATE INDEX IF NOT EXISTS idx_resource_flows_site ON resource_flows (site_id);
CREATE INDEX IF NOT EXISTS idx_sites_owner_business ON sites (owner_business_id);

-- Partial indexes for common queries
CREATE INDEX IF NOT EXISTS idx_resource_flows_output_heat ON resource_flows (site_id, temperature_celsius)
    WHERE direction = 'output' AND type = 'heat';

CREATE INDEX IF NOT EXISTS idx_resource_flows_input_heat ON resource_flows (site_id, temperature_celsius)
    WHERE direction = 'input' AND type = 'heat';

-- JSONB indexes for complex queries
CREATE INDEX IF NOT EXISTS idx_sites_utilities ON sites USING GIN (available_utilities);
CREATE INDEX IF NOT EXISTS idx_businesses_certifications ON businesses USING GIN (certifications);
|
|
```
|
|
|
|
### Data Model Enhancements
|
|
|
|
#### Multi-Tenancy Support
|
|
|
|
**Recommendation**: Support multi-tenancy from day one, even if single-tenant initially.
|
|
|
|
**Approach**: **Hybrid**
|
|
- One graph per region/municipality (e.g., one per industrial park or district)
|
|
- Graph federation for cross-region queries
|
|
- A `tenant_id` property on all nodes for future cross-tenant analytics (aggregated)
|
|
|
|
#### Data Versioning and Audit Trail
|
|
|
|
**Implementation**:
|
|
- **Event Sourcing**: Store all changes as events
|
|
- **Snapshots**: Periodically create snapshots for fast current-state queries
|
|
- **Audit Log**: Complete history of who changed what and when
|
|
|
|
**Entities Requiring Versioning**:
|
|
- ResourceFlow (quantity, quality changes affect matches)
|
|
- Business (certifications, capabilities)
|
|
- Site (location, infrastructure changes)
|
|
- Match (status changes: proposed → accepted → implemented)
|
|
|
|
#### Temporal Data Handling
|
|
|
|
**Architecture**:
|
|
- **Graph Database**: Current state, relationships, metadata
|
|
- **Time-Series Database**: Historical ResourceFlow measurements, load curves, seasonal patterns
|
|
- **Integration**: Sync aggregated time-series data to graph nodes as properties
|
|
|
|
**Use Cases**:
|
|
- Historical analysis: "What was heat demand last winter?"
|
|
- Pattern detection: Identify recurring availability patterns
|
|
- Forecasting: Predict future resource availability
|
|
- Load curves: Real-time meter data integration
|
|
|
|
#### Data Quality and Validation
|
|
|
|
**Layers**:
|
|
1. **Input Validation**: Schema validation (JSON Schema, Zod, Pydantic)
|
|
2. **Business Logic Validation**: Quality ranges, quantity constraints
|
|
3. **Cross-Entity Validation**: Ensure Site belongs to Business, ResourceFlow belongs to Site
|
|
4. **Data Quality Metrics**: Completeness, accuracy, consistency scores
|
|
|
|
**Implementation**:
|
|
- Validation at API layer using `github.com/go-playground/validator/v10`
|
|
- Background data quality jobs (Go workers)
|
|
- User-facing data quality dashboard
|
|
- Automated data quality reports
|
|
|
|
#### Data Precision Levels & Privacy Tiers
|
|
|
|
**Precision Levels** (allow rough data without blocking adoption):
|
|
- `rough (±50%)`: Ballpark estimates for initial matching
|
|
- `estimated (±20%)`: Calculated from known processes
|
|
- `measured (±5%)`: From meters/IoT devices
|
|
|
|
**Matching Engine**: Weighs "measured" matches higher but includes rough estimates in results.
|
|
|
|
**Privacy Tiers & Ownership Model** (GDPR-compliant, prevents data resale):
|
|
|
|
**Visibility Matrix by Stakeholder Type**:
|
|
- **Individual Companies**: See potential matches only (anonymized: "Company X in sector Y, 5km away")
|
|
- **Cities/Municipalities**: Aggregate cluster views (total waste heat available, CO₂ potential) - no individual company data
|
|
- **Utilities**: Network topology data for planning (pipe routes, capacity) - no commercial pricing
|
|
- **Platform**: Raw data for matching only (never shared, GDPR processor role)
|
|
|
|
**Privacy Tiers** (differential privacy applied):
|
|
- `public`: Resource type, rough quantity range, location cluster (visible to all platform users)
|
|
- `network-only`: Detailed specs, pricing (visible only to potential matches after mutual opt-in)
|
|
- `private`: Full technical details, ERP feeds (visible only to platform after k-anonymization)
|
|
|
|
**Data Ownership Rules**:
|
|
- **Company Data**: Companies retain full ownership - can revoke visibility, delete data, export at any time
|
|
- **Utility Data**: Platform acts as processor only - utilities cannot resell or commercialize data
|
|
- **Municipal Data**: Cities get read-only aggregates - individual facility data never exposed
|
|
- **Platform Data**: Matching algorithms and aggregated analytics (anonymized)
|
|
|
|
**GDPR/DPA Compliance Layer**:
|
|
- **Legal Basis**: Contractual necessity for service provision, legitimate interest for anonymized analytics
|
|
- **Data Processing Agreement**: Standard DPA template for all enterprise customers
|
|
- **Anonymization Pipeline**: k-anonymity for sensitive flows (minimum 5 similar entities)
|
|
- **Right to Erasure**: Full data deletion within 30 days, cascade to all historical matches
|
|
- **Data Portability**: Export facility data in structured format (JSON/CSV)
|
|
|
|
**Structured History Storage** (Data Moat & Analytics Foundation):
|
|
- **Versioned Resource Profiles**: Time-stamped snapshots of all resource declarations with validity periods
|
|
- **Match Attempt Logging**: Complete audit trail of all matching attempts, scores, and outcomes
|
|
- **Failure Intelligence Layer**: Structured reasons for failed/declined matches (distance, capex, legal, unwilling)
|
|
- **Economic Snapshot Preservation**: Frozen prices, volumes, assumptions at time of calculation
|
|
- **Implementation Tracking**: Status pipeline (proposed→accepted→technical→economic→legal→capex→operational)
|
|
|
|
**History Storage Architecture**:
|
|
```sql
|
|
-- NOTE: the entries below are column outlines only (shorthand, not executable DDL).
-- Layer 1: Always versioned (MRV/analytics foundation)
|
|
resource_profile_history (
|
|
id, facility_id, resource_type, payload_json,
|
|
valid_from_ts, valid_to_ts, source, quality_flag
|
|
)
|
|
|
|
-- Layer 2: Match intelligence (algorithm training)
|
|
match_attempt_log (
|
|
id, timestamp, candidate_a_id, candidate_b_id,
|
|
engine_version, score, outcome, outcome_reason_code
|
|
)
|
|
|
|
-- Layer 3: Event telemetry (optional, aggregate-only)
|
|
sensor_snapshot (id, facility_id, timestamp, readings_json)
|
|
marketplace_transaction (id, timestamp, buyer_id, seller_id, value)
|
|
```
|
|
|
|
**History Benefits**:
|
|
- **MRV Compliance**: Prove before/after reductions, time-series validation
|
|
- **Algorithm Improvement**: Failed matches inform better scoring models
|
|
- **Policy Intelligence**: "If we relax temp limits by 5°C, unlock 11 more matches"
|
|
- **Municipal Analytics**: Quarterly CO₂ dashboards with trend analysis
|
|
- **Sales Intelligence**: Demonstrate value over time to prospects
|
|
|
|
**Validation Layer**:
|
|
- **Device-signed flows**: `source = device:modbus:123` (trusted, labeled as "verified")
|
|
- **Declared flows**: Manual entry (labeled as "estimated," lower matching priority)
|
|
- **Versioned resources**: Time-stamped ResourceFlow versions for temporal analysis and delta matching
|
|
|
|
---
|
|
|
|
|