From e23395aeb229d9bac20dcea30737031dc674775e Mon Sep 17 00:00:00 2001 From: Zhongwei Li Date: Sun, 30 Nov 2025 08:55:25 +0800 Subject: [PATCH] Initial commit --- .claude-plugin/plugin.json | 12 + README.md | 3 + SKILL.md | 263 +++++++ plugin.lock.json | 105 +++ references/abap-integration.md | 483 +++++++++++++ references/additional-features.md | 394 +++++++++++ references/data-workflow.md | 362 ++++++++++ references/dtl-functions.md | 879 ++++++++++++++++++++++++ references/graphs-pipelines.md | 494 +++++++++++++ references/ml-scenario-manager.md | 582 ++++++++++++++++ references/modeling-advanced.md | 387 +++++++++++ references/operators-reference.md | 440 ++++++++++++ references/replication-flows.md | 379 ++++++++++ references/security-cdc.md | 287 ++++++++ references/structured-data-operators.md | 470 +++++++++++++ references/subengines.md | 562 +++++++++++++++ templates/basic-graph.json | 100 +++ templates/ml-training-pipeline.json | 99 +++ templates/replication-flow.json | 90 +++ 19 files changed, 6391 insertions(+) create mode 100644 .claude-plugin/plugin.json create mode 100644 README.md create mode 100644 SKILL.md create mode 100644 plugin.lock.json create mode 100644 references/abap-integration.md create mode 100644 references/additional-features.md create mode 100644 references/data-workflow.md create mode 100644 references/dtl-functions.md create mode 100644 references/graphs-pipelines.md create mode 100644 references/ml-scenario-manager.md create mode 100644 references/modeling-advanced.md create mode 100644 references/operators-reference.md create mode 100644 references/replication-flows.md create mode 100644 references/security-cdc.md create mode 100644 references/structured-data-operators.md create mode 100644 references/subengines.md create mode 100644 templates/basic-graph.json create mode 100644 templates/ml-training-pipeline.json create mode 100644 templates/replication-flow.json diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json new file mode 100644 index 0000000..f011aae --- /dev/null +++ b/.claude-plugin/plugin.json @@ -0,0 +1,12 @@ +{ + "name": "sap-hana-cloud-data-intelligence", + "description": "Data processing pipelines and ML scenarios in SAP Data Intelligence Cloud. Covers graph operators, ABAP/S4HANA integration, replication flows, and JupyterLab.", + "version": "1.0.0", + "author": { + "name": "Zhongwei Li", + "email": "zhongweili@tubi.tv" + }, + "skills": [ + "./" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..d4bceff --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# sap-hana-cloud-data-intelligence + +Data processing pipelines and ML scenarios in SAP Data Intelligence Cloud. Covers graph operators, ABAP/S4HANA integration, replication flows, and JupyterLab. diff --git a/SKILL.md b/SKILL.md new file mode 100644 index 0000000..ef5dcba --- /dev/null +++ b/SKILL.md @@ -0,0 +1,263 @@ +--- +name: sap-hana-cloud-data-intelligence +description: | + Develops data processing pipelines, integrations, and machine learning scenarios in SAP Data Intelligence Cloud. Use when building graphs/pipelines with operators, integrating ABAP/S4HANA systems, creating replication flows, developing ML scenarios with JupyterLab, or using Data Transformation Language functions. Covers Gen1/Gen2 operators, subengines (Python, Node.js, C++), structured data operators, and repository objects. 
+license: GPL-3.0 +metadata: + version: "1.0.0" + last_verified: "2025-11-27" +--- + +# SAP HANA Cloud Data Intelligence Skill + +This skill provides comprehensive guidance for developing with SAP Data Intelligence Cloud, including pipeline creation, operator development, data integration, and machine learning scenarios. + +## When to Use This Skill + +Use this skill when: + +- Creating or modifying data processing graphs/pipelines +- Developing custom operators (Gen1 or Gen2) +- Integrating ABAP-based SAP systems (S/4HANA, BW) +- Building replication flows for data movement +- Developing ML scenarios with ML Scenario Manager +- Working with JupyterLab in Data Intelligence +- Using Data Transformation Language (DTL) functions +- Configuring subengines (Python, Node.js, C++) +- Working with structured data operators + +## Core Concepts + +### Graphs (Pipelines) + +Graphs are networks of operators connected via typed input/output ports for data transfer. + +**Two Generations:** +- **Gen1 Operators**: Legacy operators, broad compatibility +- **Gen2 Operators**: Enhanced error recovery, state management, snapshots + +**Critical Rule**: Graphs cannot mix Gen1 and Gen2 operators - choose one generation per graph. + +**Gen2 Advantages:** +- Automatic error recovery with snapshots +- State management with periodic checkpoints +- Native multiplexing (one-to-many, many-to-one) +- Improved Python3 operator + +### Operators + +Building blocks that process data within graphs. Each operator has: +- **Ports**: Typed input/output connections for data flow +- **Configuration**: Parameters that control behavior +- **Runtime**: Engine that executes the operator + +**Operator Categories:** +1. Messaging (Kafka, MQTT, NATS) +2. Storage (Files, HDFS, S3, Azure, GCS) +3. Database (HANA, SAP BW, SQL) +4. Script (Python, JavaScript, R, Go) +5. Data Processing (Transform, Anonymize, Validate) +6. Machine Learning (TensorFlow, PyTorch, HANA ML) +7. Integration (OData, REST, SAP CPI) +8. Workflow (Pipeline, Data Workflow) + +### Subengines + +Subengines enable operators to run on different runtimes within the same graph. + +**Supported Subengines:** +- **ABAP**: For ABAP Pipeline Engine operators +- **Python 3.9**: For Python-based operators +- **Node.js**: For JavaScript-based operators +- **C++**: For high-performance native operators + +**Key Benefit**: Connected operators on the same subengine run in a single OS process for optimal performance. + +**Trade-off**: Cross-engine communication requires serialization/deserialization overhead. + +## Quick Start Patterns + +### Basic Graph Creation + +``` +1. Open SAP Data Intelligence Modeler +2. Create new graph +3. Add operators from repository +4. Connect operator ports (matching types) +5. Configure operator parameters +6. Validate graph +7. Execute and monitor +``` + +### Replication Flow Pattern + +``` +1. Create replication flow in Modeler +2. Configure source connection (ABAP, HANA, etc.) +3. Configure target (HANA Cloud, S3, Kafka, etc.) +4. Add tasks with source objects +5. Define filters and mappings +6. Validate flow +7. Deploy to tenant repository +8. Run and monitor +``` + +**Delivery Guarantees:** +- Default: At-least-once (may have duplicates) +- With UPSERT to databases: Exactly-once +- For cloud storage: Use "Suppress Duplicates" option + +### ML Scenario Pattern + +``` +1. Open ML Scenario Manager from launchpad +2. Create new scenario +3. Add datasets (register data sources) +4. Create Jupyter notebooks for experiments +5. 
Build training pipelines +6. Track metrics with Metrics Explorer +7. Version scenario for reproducibility +8. Deploy model pipeline +``` + +## Common Tasks + +### ABAP System Integration + +For integrating ABAP-based SAP systems: + +1. **Prerequisites**: Configure Cloud Connector for on-premise systems +2. **Connection Setup**: Create ABAP connection in Connection Management +3. **Metadata Access**: Use Metadata Explorer for object discovery +4. **Data Sources**: CDS Views, ODP (Operational Data Provisioning), Tables + +**Reference**: See `references/abap-integration.md` for detailed setup. + +### Structured Data Processing + +Use structured data operators for SQL-like transformations: + +- **Data Transform**: Visual SQL editor for complex transformations +- **Aggregation Node**: GROUP BY with aggregation functions +- **Join Node**: INNER, LEFT, RIGHT, FULL joins +- **Projection Node**: Column selection and renaming +- **Union Node**: Combine multiple datasets +- **Case Node**: Conditional logic + +**Reference**: See `references/structured-data-operators.md` for configuration. + +### Data Transformation Language + +DTL provides SQL-like functions for data processing: + +**Function Categories:** +- String: CONCAT, SUBSTRING, UPPER, LOWER, TRIM, REPLACE +- Numeric: ABS, CEIL, FLOOR, ROUND, MOD, POWER +- Date/Time: ADD_DAYS, MONTHS_BETWEEN, EXTRACT, CURRENT_UTCTIMESTAMP +- Conversion: TO_DATE, TO_STRING, TO_INTEGER, TO_DECIMAL +- Miscellaneous: CASE, COALESCE, IFNULL, NULLIF + +**Reference**: See `references/dtl-functions.md` for complete reference. + +## Best Practices + +### Graph Design + +1. **Choose Generation Early**: Decide Gen1 vs Gen2 before building +2. **Minimize Cross-Engine Communication**: Group operators by subengine +3. **Use Appropriate Port Types**: Match data types for efficient transfer +4. **Enable Snapshots**: For Gen2 graphs, enable auto-recovery +5. **Validate Before Execution**: Always validate graphs + +### Operator Development + +1. **Start with Built-in Operators**: Use predefined operators first +2. **Extend When Needed**: Create custom operators for specific needs +3. **Use Script Operators**: For quick prototyping with Python/JS +4. **Version Your Operators**: Track changes with operator versions +5. **Document Configuration**: Describe all parameters + +### Replication Flows + +1. **Plan Target Schema**: Understand target structure requirements +2. **Use Filters**: Reduce data volume with source filters +3. **Handle Duplicates**: Configure for exactly-once when possible +4. **Monitor Execution**: Track progress and errors +5. **Clean Up Artifacts**: Remove source artifacts after completion + +### ML Scenarios + +1. **Version Early**: Create versions before major changes +2. **Track All Metrics**: Use SDK for comprehensive tracking +3. **Use Notebooks for Exploration**: JupyterLab for experimentation +4. **Productionize with Pipelines**: Convert notebooks to pipelines +5. 
**Export/Import for Migration**: Use ZIP export for transfers + +## Error Handling + +### Common Graph Errors + +| Error | Cause | Solution | +|-------|-------|----------| +| Port type mismatch | Incompatible data types | Use converter operator or matching types | +| Gen1/Gen2 mixing | Combined operator generations | Use single generation per graph | +| Resource exhaustion | Insufficient memory/CPU | Adjust resource requirements | +| Connection failure | Network or credentials | Verify connection settings | +| Validation errors | Invalid configuration | Review error messages, fix config | + +### Recovery Strategies + +**Gen2 Graphs:** +- Enable automatic recovery in graph settings +- Configure snapshot intervals +- Monitor recovery status + +**Gen1 Graphs:** +- Implement manual error handling in operators +- Use try-catch in script operators +- Configure retry logic + +## Reference Files + +For detailed information, see: + +- `references/operators-reference.md` - Complete operator catalog (266 operators) +- `references/abap-integration.md` - ABAP/S4HANA/BW integration with SAP Notes +- `references/structured-data-operators.md` - Structured data processing +- `references/dtl-functions.md` - Data Transformation Language (79 functions) +- `references/ml-scenario-manager.md` - ML Scenario Manager, SDK, artifacts +- `references/subengines.md` - Python, Node.js, C++ subengine development +- `references/graphs-pipelines.md` - Graph execution, snapshots, recovery +- `references/replication-flows.md` - Replication flows, cloud storage, Kafka +- `references/data-workflow.md` - Data workflow operators, orchestration +- `references/security-cdc.md` - Security, data protection, CDC methods +- `references/additional-features.md` - Monitoring, cloud storage services, scenario templates, data types, Git terminal +- `references/modeling-advanced.md` - Graph snippets, SAP cloud apps, configuration types, 141 graph templates + +## Templates + +Starter templates are available in `templates/`: + +- `templates/basic-graph.json` - Simple data processing graph +- `templates/replication-flow.json` - Data replication pattern +- `templates/ml-training-pipeline.json` - ML training workflow + +## Documentation Links + +**Primary Sources:** +- GitHub Docs: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs) +- SAP Help Portal: [https://help.sap.com/docs/SAP_DATA_INTELLIGENCE](https://help.sap.com/docs/SAP_DATA_INTELLIGENCE) +- SAP Developer Center: [https://developers.sap.com/topics/data-intelligence.html](https://developers.sap.com/topics/data-intelligence.html) + +**Section-Specific:** +- Modeling Guide: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide) +- ABAP Integration: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/abapintegration](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/abapintegration) +- Machine Learning: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/machinelearning](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/machinelearning) +- Function Reference: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/functionreference](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/functionreference) +- Repository 
Objects: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/repositoryobjects](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/repositoryobjects) + +## Version Information + +- **Skill Version**: 1.0.0 +- **Last Updated**: 2025-11-27 +- **Documentation Source**: SAP-docs/sap-hana-cloud-data-intelligence (GitHub) diff --git a/plugin.lock.json b/plugin.lock.json new file mode 100644 index 0000000..96bd4a4 --- /dev/null +++ b/plugin.lock.json @@ -0,0 +1,105 @@ +{ + "$schema": "internal://schemas/plugin.lock.v1.json", + "pluginId": "gh:secondsky/sap-skills:skills/sap-hana-cloud-data-intelligence", + "normalized": { + "repo": null, + "ref": "refs/tags/v20251128.0", + "commit": "1be219f1b36372577a12f714193bab81e9ef167f", + "treeHash": "b0d49a52297195782a9dc2ba0d363bf8c1712c7a7d45a3327a832dcc0553a7b7", + "generatedAt": "2025-11-28T10:28:14.055341Z", + "toolVersion": "publish_plugins.py@0.2.0" + }, + "origin": { + "remote": "git@github.com:zhongweili/42plugin-data.git", + "branch": "master", + "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390", + "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data" + }, + "manifest": { + "name": "sap-hana-cloud-data-intelligence", + "description": "Data processing pipelines and ML scenarios in SAP Data Intelligence Cloud. Covers graph operators, ABAP/S4HANA integration, replication flows, and JupyterLab.", + "version": "1.0.0" + }, + "content": { + "files": [ + { + "path": "README.md", + "sha256": "56f3f7d1abeab8e01945e753cd284bfa0a5aacb8fa185ae2272c1248d795675a" + }, + { + "path": "SKILL.md", + "sha256": "7313f8ce2068eb28baec80aa4d7b88ea852d7d887aaf591af671b6cf5c754f3f" + }, + { + "path": "references/subengines.md", + "sha256": "683fefe11c103a1d5a1bb6b596e93ddf30132b462a997ee47fdc386c4077c024" + }, + { + "path": "references/data-workflow.md", + "sha256": "57196560360fba13dda7c8290d14a6b03b2d4a856945ea43506b57baaa2c3e98" + }, + { + "path": "references/operators-reference.md", + "sha256": "3a572c0bc89805ac8d21d7450c8d9563093edfd68e5a8909d963bd8d7a989e29" + }, + { + "path": "references/graphs-pipelines.md", + "sha256": "f301da0471ede6eb27a16fc2d4553e72d56defc04d4c3054a54b347aebf6f7d4" + }, + { + "path": "references/modeling-advanced.md", + "sha256": "d9bfcd3b16637bdaffcfdc03887f984dce52cdd8ced174057a17e2f508f5e82e" + }, + { + "path": "references/additional-features.md", + "sha256": "449ee0a2fedcfd7a6f46b5744a26e08f47cc63e46383dbc5c6366fa73e653207" + }, + { + "path": "references/structured-data-operators.md", + "sha256": "1eae8fb23a917a1d72c6c392c8a88a2a17d30e29809566df7fe28feb51531a47" + }, + { + "path": "references/dtl-functions.md", + "sha256": "1c07cf3f339c3efbc5ebc41de626ab98a4d37a09fd63b15b6b1ca821faed6734" + }, + { + "path": "references/abap-integration.md", + "sha256": "8a3b12564ec4920d92f606553e61df96e50ef113259192741fce642960f710b0" + }, + { + "path": "references/replication-flows.md", + "sha256": "9e614451bb347d655cd768edfb17a0f22e0e06b7cf39c1cd3d425ba17ada59df" + }, + { + "path": "references/security-cdc.md", + "sha256": "40e65d1666dd5cda77a858cb8197a5ae7810bf4bf4a5cf3f1db3f23917048eab" + }, + { + "path": "references/ml-scenario-manager.md", + "sha256": "6a73f0a7218c73b292df9a4a8b8e7d06fe0926115f018b153fa747aa9ffd8594" + }, + { + "path": ".claude-plugin/plugin.json", + "sha256": "00ed0f75239da5cca0da5e997be402c516692a2f17ec2759ea3224100a5b039b" + }, + { + "path": "templates/ml-training-pipeline.json", + "sha256": "54d4a5462c7772863360a13b20e83a9f05f7d3b6289db28f873fce490c5bf511" + }, + { + 
"path": "templates/basic-graph.json", + "sha256": "ee0b8ee4d89722625e5dd0162070380925fc726bb80409362a7da406b6d972ee" + }, + { + "path": "templates/replication-flow.json", + "sha256": "bcf4b9669e8258a31a74e96c66877ab302bf75a93ede1d82da39b3d74da453b5" + } + ], + "dirSha256": "b0d49a52297195782a9dc2ba0d363bf8c1712c7a7d45a3327a832dcc0553a7b7" + }, + "security": { + "scannedAt": null, + "scannerVersion": null, + "flags": [] + } +} \ No newline at end of file diff --git a/references/abap-integration.md b/references/abap-integration.md new file mode 100644 index 0000000..7ee7382 --- /dev/null +++ b/references/abap-integration.md @@ -0,0 +1,483 @@ +# ABAP Integration Guide + +Complete guide for integrating ABAP-based SAP systems with SAP Data Intelligence. + +## Table of Contents + +1. [Overview](#overview) +2. [Prerequisites](#prerequisites) +3. [Cloud Connector Setup](#cloud-connector-setup) +4. [Connection Configuration](#connection-configuration) +5. [Data Sources](#data-sources) +6. [ABAP Operators](#abap-operators) +7. [Custom ABAP Operators](#custom-abap-operators) +8. [Data Type Mapping](#data-type-mapping) +9. [Security](#security) +10. [Troubleshooting](#troubleshooting) + +--- + +## Overview + +SAP Data Intelligence Cloud integrates with ABAP-based SAP systems including: + +- SAP S/4HANA (Cloud and On-Premise) +- SAP Business Warehouse (BW/4HANA, BW) +- SAP ECC +- Other NetWeaver-based systems + +**Key Characteristics:** +- ABAP operators run on the ABAP Pipeline Engine in the source system +- Metadata and data flow through SAP Data Intelligence Cloud +- Supports real-time and batch data extraction + +**Central Reference**: SAP Note 2890171 contains essential setup information. + +--- + +## Prerequisites + +### SAP System Requirements + +**Minimum Versions:** +- SAP NetWeaver 7.50 SP00 or higher +- SAP S/4HANA 1909 or higher (recommended) +- SAP BW/4HANA 2.0 or higher + +**Required Components:** +- ABAP Pipeline Engine (installed via SAP Notes) +- Cloud Connector (for on-premise systems) + +### SAP Data Intelligence Requirements + +- SAP Data Intelligence Cloud tenant +- Connection Management access +- Appropriate authorizations + +--- + +## Cloud Connector Setup + +For on-premise ABAP systems, configure SAP Cloud Connector. + +### Installation Steps + +1. **Download Cloud Connector** from SAP Support Portal +2. **Install** on a server with network access to ABAP system +3. **Configure** connection to SAP BTP subaccount +4. **Map** internal ABAP system to virtual host + +### Configuration Example + +``` +Location ID: +Internal Host: :44300 +Virtual Host: virtualabap:44300 +Protocol: HTTPS +Principal Propagation: Enabled (optional) +``` + +### Trust Configuration + +1. Import SAP Data Intelligence CA certificate +2. Configure system certificate for backend +3. Enable principal propagation if needed + +### Resource Configuration + +Configure required resources in Cloud Connector Resources pane: + +**For CDS View Extraction:** +``` +Prefix: DHAMB_ (required) +Prefix: DHAPE_ (required) +Function: RFC_FUNCTION_SEARCH +``` + +**For SLT Replication Server:** +``` +Prefix: LTAMB_ (required) +Prefix: LTAPE_ (required) +Function: RFC_FUNCTION_SEARCH +``` + +### SNC Configuration + +If using Secure Network Communication (SNC): +- Configure SNC in Cloud Connector (not in DI Connection Management) +- Upload SNC certificates to Cloud Connector +- Map SNC names appropriately + +--- + +## Connection Configuration + +### Creating ABAP Connection + +1. 
**Open Connection Management** in SAP Data Intelligence +2. **Create new connection** with type "ABAP" +3. **Configure parameters**: + +**Basic Settings:** +``` +Connection Type: ABAP +Host: +Port: 44300 +Client: 100 +System ID: +``` + +**Authentication:** +- Basic Authentication: Username/Password +- Principal Propagation: SSO via Cloud Connector +- X.509 Certificate: Certificate-based + +**Cloud Connector:** +``` +Location ID: +``` + +### Testing Connection + +Use "Test Connection" to verify: +- Network connectivity +- Authentication +- Authorization + +--- + +## Data Sources + +### CDS Views + +ABAP Core Data Services views are the recommended data source. + +**SAP S/4HANA Cloud Options:** +- Standard CDS views with C1 release contract annotations +- Custom CDS views developed for Data Intelligence integration + +**SAP S/4HANA On-Premise Options:** +- Standard CDS views with C1 release contract +- Custom ABAP CDS views created using ABAP Development Tool (ADT) + +**Discovery Methods:** +- Use `I_DataExtractionEnabledView` (available in S/4HANA 2020+) to find extraction-enabled views +- Check Metadata Explorer properties: "Extraction Enabled", "Delta enabled", "CDS Type" + +**Required Annotations for Custom Views:** +```abap +@Analytics.dataExtraction.enabled: true +@Analytics.dataExtraction.delta.changeDataCapture.automatic: true +``` + +**Configuration:** +``` +CDS View: I_Product +Selection Fields: Product, ProductType +Package Size: 10000 +``` + +**Best Practices:** +- Use extraction-enabled views for delta capability +- Apply filters to reduce data volume +- Consider view performance characteristics +- Verify C1 contract compliance for standard views + +### ODP (Operational Data Provisioning) + +Framework for extracting data from various ABAP sources. + +**Prerequisites:** +- Gen1 operators: Require DMIS add-on + ODP Application API v2 +- Gen2 operators/Replication flows with S/4HANA: Only ODP Application API required +- Legacy systems: Both DMIS add-on + ODP Application API required + +**Key SAP Notes:** +- SAP Note 2890171: ODP integration requirements +- SAP Note 2775549: Integration prerequisites +- SAP Note 3100673: Technical user privileges + +**Connection Types:** + +| Type | Description | Use Case | +|------|-------------|----------| +| ABAP | Recommended connection | Latest features, resilience, replication flows | +| ABAP_LEGACY | Fallback option | When DMIS installation not feasible | + +**ODP Contexts:** +- SAPI: DataSources (classic BW extractors) +- ABAP CDS: CDS views with extraction annotations +- BW: BW InfoProviders +- SLT: SLT-replicated tables + +**Delta Support:** +- Full extraction (initial load) +- Delta extraction (CDC - Change Data Capture) +- Delta initialization (reset delta pointer) + +### Tables + +Direct table access for simple scenarios. + +**Configuration:** +``` +Table: MARA +Fields: MATNR, MTART, MATKL +Where Clause: MTART = 'FERT' +``` + +**Limitations:** +- No built-in delta capability +- May require additional authorization +- Consider performance for large tables + +### SLT (SAP Landscape Transformation) + +Real-time data replication via SLT Server. + +**Components:** +- SLT Server (on-premise) +- Mass Transfer ID configuration +- Target connection in Data Intelligence + +**Capabilities:** +- Real-time CDC +- Initial load + continuous delta +- Table-level replication + +--- + +## ABAP Operators + +### ABAP CDS Reader + +Reads from CDS views with extraction. 
+ +**Key Parameters:** +- CDS View Name +- Selection Conditions +- Package Size (rows per batch) +- Max Rows (limit) + +### ABAP Table Reader + +Reads from ABAP tables directly. + +**Key Parameters:** +- Table Name +- Field List +- Where Clause +- Order By + +### SLT Connector + +Connects to SLT for real-time replication. + +**Key Parameters:** +- Mass Transfer ID +- Table Name +- Initial Load (yes/no) +- Subscription Type + +### Generation 1 vs Generation 2 + +**Gen1 ABAP Operators:** +- Traditional process model +- Manual recovery + +**Gen2 ABAP Operators:** +- Enhanced recovery +- State management +- Better error handling + +--- + +## Custom ABAP Operators + +### Architecture + +Custom operators run in the ABAP Pipeline Engine: + +``` +SAP Data Intelligence <-> ABAP Pipeline Engine <-> Custom Operator Code +``` + +### Creating Custom Operators + +1. **Create ABAP class** implementing interface +2. **Register** in ABAP Pipeline Engine repository +3. **Deploy** to SAP Data Intelligence +4. **Use** in graphs + +### Implementation Pattern + +```abap +CLASS zcl_custom_operator DEFINITION + PUBLIC + FINAL + CREATE PUBLIC. + + PUBLIC SECTION. + INTERFACES if_sdi_operator. + + PRIVATE SECTION. + DATA: mv_parameter TYPE string. + +ENDCLASS. + +CLASS zcl_custom_operator IMPLEMENTATION. + + METHOD if_sdi_operator~init. + " Initialize operator + mv_parameter = io_context->get_parameter( 'PARAM1' ). + ENDMETHOD. + + METHOD if_sdi_operator~start. + " Start processing + DATA: lt_data TYPE TABLE OF string. + " ... process data ... + io_context->send_output( 'OUTPUT' , lt_data ). + ENDMETHOD. + +ENDCLASS. +``` + +--- + +## Data Type Mapping + +### ABAP to Data Intelligence Type Mapping + +| ABAP Type | Data Intelligence Type | +|-----------|----------------------| +| CHAR | String | +| NUMC | String | +| DATS | Date | +| TIMS | Time | +| DEC | Decimal | +| INT1/INT2/INT4 | Integer | +| FLTP | Double | +| RAW | Binary | +| STRING | String | +| RAWSTRING | Binary | + +### Wire Format Conversion Options (Gen1) + +| Conversion Type | Description | +|-----------------|-------------| +| Enhanced format conversion | Validates data, converts invalid to "NaN" or "undefined" per ISO standards | +| Required conversions | Minimal technical changes, preserves invalid values | +| Required conversions + currency | Adds currency shift conversion | +| Required conversions + time format + currency | ISO date/time format + currency handling | + +**Gen2 and Replication Flows:** +Default: "Required Conversions Plus Time Format and Currency" (cannot be changed) + +**Detailed Mappings:** See SAP Note 3035658 for complete type conversion tables. + +### Conversion Considerations + +**Date/Time:** +- ABAP dates: YYYYMMDD format +- Initial dates (00000000) may need handling +- ISO format conversion available for Gen2 + +**Numbers:** +- NUMC fields are strings (preserve leading zeros) +- Packed decimals maintain precision +- Currency fields may be shifted based on currency table + +**Binary:** +- RAW fields convert to binary +- Consider encoding for text storage + +--- + +## Security + +### Authorization Requirements + +**SAP Data Intelligence:** +- Connection Management access +- Graph execution rights + +**ABAP System:** +- RFC authorization +- Data access authorization (S_TABU_DIS, etc.) 
+- CDS view authorization (@AccessControl) + +### Network Security + +- Cloud Connector encryption (TLS) +- Principal propagation for SSO +- IP restrictions + +### Data Protection + +- Apply CDS access controls +- Mask sensitive fields in extraction +- Audit logging in both systems + +--- + +## Troubleshooting + +### Common Issues + +| Issue | Cause | Solution | SAP Note | +|-------|-------|----------|----------| +| Session limit exceeded | ABAP Pipeline Engine constraints | Adjust session limits | 2999448 | +| Connection validation fails | Connection configuration | Check connection setup | 2849542 | +| CDS view extraction error (Gen1) | View not extraction-enabled | Enable extraction annotations | - | +| SLT subscription conflict (Gen1) | Subscription already in use | Release subscription | 3057246 | +| Invalid character in string | Encoding issues | Check data encoding | 3016338 | +| Object does not exist (Gen2) | Object not found | Verify object name | 3143151 | +| Connection timeout | Network/firewall | Check Cloud Connector mapping | - | +| Authentication failed | Wrong credentials | Verify user/password | - | +| Authorization error | Missing permissions | Check ABAP authorizations | - | + +### CDS View Specific Issues + +**"CDS view does not support data extraction":** +1. Verify extraction annotations exist +2. Check C1 contract compliance for standard views +3. Use `I_DataExtractionEnabledView` to verify + +### Diagnostic Steps + +1. **Test connection** in Connection Management +2. **Check Cloud Connector** logs +3. **Review ABAP** system logs (ST22, SM21) +4. **Monitor** Pipeline Engine (transaction /IWREP/MONITOR) +5. **Check** SAP Data Intelligence execution logs + +### SAP Notes Reference + +| Note | Description | +|------|-------------| +| 2890171 | Central ABAP integration note | +| 2775549 | Integration prerequisites | +| 2849542 | Connection troubleshooting | +| 2973594 | Known issues and corrections | +| 2999448 | Session limit issues | +| 3016338 | Invalid character errors | +| 3035658 | Data type conversion tables | +| 3057246 | SLT subscription conflicts | +| 3100673 | Technical user privileges | +| 3143151 | Object not found errors | + +### Resources + +- **SAP Community**: [https://community.sap.com/topics/data-intelligence](https://community.sap.com/topics/data-intelligence) +- **SAP Support Portal**: [https://support.sap.com](https://support.sap.com) + +--- + +## Documentation Links + +- **ABAP Integration Guide**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/abapintegration](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/abapintegration) +- **User Guide**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/abapintegration/user-guide-for-abap-integration](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/abapintegration/user-guide-for-abap-integration) + +--- + +**Last Updated**: 2025-11-22 diff --git a/references/additional-features.md b/references/additional-features.md new file mode 100644 index 0000000..4eb34ae --- /dev/null +++ b/references/additional-features.md @@ -0,0 +1,394 @@ +# Additional Features Guide + +Additional SAP Data Intelligence features including monitoring, cloud storage services, scenario templates, data types, and Git integration. + +## Table of Contents + +1. [Monitoring Application](#monitoring-application) +2. [Cloud Storage Services](#cloud-storage-services) +3. [Scenario Templates](#scenario-templates) +4. 
[Custom Data Types](#custom-data-types) +5. [Git Terminal Integration](#git-terminal-integration) +6. [Graph Snippets](#graph-snippets) + +--- + +## Monitoring Application + +SAP Data Intelligence includes a stand-alone monitoring application for operational oversight. + +### Accessing the Monitor + +**Options:** +- SAP Data Intelligence Launchpad tile +- Direct stable URL access + +### Capabilities + +| Feature | Description | +|---------|-------------| +| Graph Status | View execution status, timing, type, source | +| Scheduling | Schedule graph executions | +| Termination | Terminate running processes | +| Navigation | Open graphs directly in Modeler | +| Configuration | Review graph configurations | +| Replication Flows | Monitor flows and associated tasks | + +### Access Permissions + +**With `sap.dh.monitoring` policy:** +- View analytics and instances for all tenant users +- Does not include schedule access + +**Without policy:** +- Monitor only your own graphs + +### What's Displayed + +For each graph instance: +- Execution status (Running, Completed, Failed, Dead) +- Run timing (start, end, duration) +- Graph classification +- Source origin + +--- + +## Cloud Storage Services + +SAP Data Intelligence supports multiple cloud storage platforms. + +### Supported Services + +| Service | Description | Protocol | +|---------|-------------|----------| +| **Amazon S3** | AWS object storage | S3 API | +| **Azure Blob Storage (WASB)** | Microsoft cloud storage | WASB protocol | +| **Azure Data Lake (ADL/ADLS Gen2)** | Microsoft data lake | ADLS API | +| **Google Cloud Storage (GCS)** | Google object storage | GCS API | +| **Alibaba Cloud OSS** | Alibaba object storage | OSS API | +| **HDFS** | Hadoop distributed file system | HDFS protocol | +| **WebHDFS** | HDFS via REST API | HTTP/REST | +| **Local File System** | Local storage | File system | + +### Connection Configuration + +Each service requires specific connection parameters in Connection Management. + +**Common Parameters:** +- Connection ID +- Root path/bucket +- Authentication credentials + +**Service-Specific Examples:** + +**Amazon S3:** +``` +Connection Type: S3 +Region: us-east-1 +Access Key: +Secret Key: +Bucket: my-bucket +``` + +**Azure Blob Storage:** +``` +Connection Type: WASB +Account Name: +Account Key: +Container: my-container +``` + +**Google Cloud Storage:** +``` +Connection Type: GCS +Project ID: +Service Account Key: +Bucket: my-bucket +``` + +### Usage in Operators + +File operators use connection IDs to access storage: +- Structured File Consumer/Producer +- Binary File Consumer/Producer +- Cloud-specific operators (S3 Consumer, etc.) + +--- + +## Scenario Templates + +Pre-built graph scenarios for common use cases. + +### Finding Templates + +1. Open Modeler application +2. Navigate to Graphs tab +3. Enable "Scenario Templates" in visible categories +4. Or search for package `com.sap.scenarioTemplates` + +### Template Categories + +#### 1. ABAP with Data Lakes + +Ingest ABAP data into cloud storage. + +**Use Cases:** +- Extract ABAP tables to S3/Azure/GCS +- Replicate CDS views to data lake +- S/4HANA data extraction + +**Key Operators:** +- ABAP CDS Reader +- Read Data From SAP System +- Structured File Producer + +#### 2. Data Processing with Scripting Languages + +Manipulate data using scripts. + +**Use Cases:** +- Custom transformations with Python +- JavaScript data processing +- R statistical analysis + +**Key Operators:** +- Python3 Operator +- JavaScript Operator +- R Operator + +#### 3. 
ETL from Database + +Extract, transform, and load database data. + +**Use Cases:** +- Database to file storage +- Database to database transfer +- SQL-based transformations + +**Key Operators:** +- SQL Consumer +- Structured SQL Consumer +- Table Producer + +#### 4. Loading Data from Data Lake to SAP HANA + +Batch and stream data to HANA. + +**Use Cases:** +- Load files to HANA tables +- Stream data to HANA +- Data lake integration + +**Key Operators:** +- Structured File Consumer +- HANA Client +- Write HANA Table + +### Using Templates + +1. Find template in Graphs tab +2. Copy template to your workspace +3. Customize connections and parameters +4. Test with sample data +5. Deploy for production use + +--- + +## Custom Data Types + +Extend the type system with custom data types. + +### Data Type Categories + +| Category | Description | Customizable | +|----------|-------------|--------------| +| **Scalar** | Basic types (string, int, etc.) | No | +| **Structure** | Composite with named properties | Yes | +| **Table** | Column-based with keys | Yes | + +### Creating Global Data Types + +1. **Access Editor:** + - Open Modeler + - Navigate to Data Types tab + - Click plus icon + +2. **Configure Type:** + - Enter name (two+ identifiers separated by periods) + - Select type: Structure or Table + - Click OK + +3. **Define Properties:** + - Add properties with plus icon + - For structures: property name + scalar type + - For tables: property name + scalar type + optional Key flag + +4. **Save:** + - Click save icon + - Use "Save As" for variants + +### Naming Convention + +``` +namespace.typename + +Examples: +com.mycompany.CustomerRecord +com.mycompany.SalesData +``` + +### Structure Type Example + +``` +Type: com.company.Address +Properties: + - street: string + - city: string + - country: string + - postalCode: string +``` + +### Table Type Example + +``` +Type: com.company.OrderItems +Properties: + - orderId: string (Key) + - lineNumber: int64 (Key) + - productId: string + - quantity: int32 + - unitPrice: float64 +``` + +### Managing Data Types + +| Action | How | +|--------|-----| +| Edit | Double-click in tree view | +| Delete | Right-click > Delete | +| Copy | Save As with new name | + +--- + +## Git Terminal Integration + +Version control integration for SAP Data Intelligence artifacts. + +### Purpose + +Integrate file-based content with Git servers: +- Graphs +- Operators +- Dockerfiles +- Script code + +### Accessing Git Terminal + +1. Open Modeler +2. Navigate to Git Terminal option +3. Terminal opens with Git capabilities + +### Available Commands + +| Command | Function | +|---------|----------| +| `git init` | Initialize local repository | +| `git clone ` | Clone remote repository | +| `git add` | Stage changes | +| `git commit` | Commit changes | +| `git push` | Push to remote | +| `git pull` | Pull from remote | +| `git branch` | Create/list branches | +| `git merge` | Merge branches | +| `git rebase` | Rebase commits | +| `git status` | View status | +| `git log` | View history | + +### Credential Handling + +Configure Git credentials using standard Git Credential Helper: + +```bash +git config --global credential.helper store +git config --global user.name "Your Name" +git config --global user.email "your.email@company.com" +``` + +### Creating Local Repository + +```bash +cd /workspace/my-project +git init +git add . 
+git commit -m "Initial commit" +``` + +### Cloning Remote Repository + +```bash +git clone [https://github.com/your-org/your-repo.git](https://github.com/your-org/your-repo.git) +cd your-repo +``` + +### .gitignore Configuration + +Control what gets tracked: + +```gitignore +# Ignore all except specific files +* +!graph.json +!operator.json +!*.py +``` + +### Best Practices + +1. **Commit Often**: Small, focused commits +2. **Use Branches**: Feature branches for development +3. **Pull Before Push**: Avoid conflicts +4. **Meaningful Messages**: Descriptive commit messages +5. **Review Changes**: Check status before commit + +--- + +## Graph Snippets + +Reusable graph fragments for common patterns. + +### Creating Snippets + +1. Build working graph pattern +2. Select operators to include +3. Right-click > Save as Snippet +4. Name and describe snippet + +### Using Snippets + +1. Open Graphs tab +2. Find snippet in repository +3. Drag snippet to canvas +4. Configure connections +5. Customize parameters + +### Snippet Best Practices + +1. **Document Well**: Clear descriptions +2. **Parameterize**: Use substitution variables +3. **Test Thoroughly**: Verify before sharing +4. **Version**: Track snippet versions + +--- + +## Documentation Links + +- **Monitoring**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide/dataintelligence-monitoring](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide/dataintelligence-monitoring) +- **Service-Specific**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide/service-specific-information](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide/service-specific-information) +- **Scenario Templates**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide/using-scenario-templates](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide/using-scenario-templates) +- **Data Types**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide/creating-data-types](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide/creating-data-types) +- **Git Terminal**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide/using-git-terminal](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide/using-git-terminal) +- **Graph Snippets**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide/using-graph-snippets](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide/using-graph-snippets) + +--- + +**Last Updated**: 2025-11-22 diff --git a/references/data-workflow.md b/references/data-workflow.md new file mode 100644 index 0000000..4e35288 --- /dev/null +++ b/references/data-workflow.md @@ -0,0 +1,362 @@ +# Data Workflow Operators Guide + +Complete guide for data workflow orchestration in SAP Data Intelligence. + +## Table of Contents + +1. [Overview](#overview) +2. [Workflow Structure](#workflow-structure) +3. [Available Operators](#available-operators) +4. [Data Transfer](#data-transfer) +5. [Remote Execution](#remote-execution) +6. [Control Flow](#control-flow) +7. [Notifications](#notifications) +8. 
[Best Practices](#best-practices) + +--- + +## Overview + +Data Workflow operators orchestrate data processing tasks that run for a limited time and finish with either "completed" or "dead" status. + +**Key Characteristics:** +- Sequential execution via signal passing +- Operators start after receiving input signal +- Each operator has input, output, and error ports +- Unconnected output ports cause graph failure + +**Important:** Do not mix Data Workflow operators with non-Data Workflow operators in the same graph. + +--- + +## Workflow Structure + +### Required Components + +Every data workflow requires: +- **Workflow Trigger**: First operator (starts the workflow) +- **Workflow Terminator**: Last operator (ends the workflow) + +### Basic Structure + +``` +[Workflow Trigger] -> [Task Operator(s)] -> [Workflow Terminator] +``` + +### Signal Flow + +1. Workflow Trigger sends initial signal +2. Each operator waits for input signal +3. Operator executes task +4. Operator sends output signal (or error) +5. Next operator receives signal and executes +6. Workflow Terminator completes the graph + +--- + +## Available Operators + +### Core Workflow Operators + +| Operator | Purpose | +|----------|---------| +| Workflow Trigger | Initiates workflow execution | +| Workflow Terminator | Concludes workflow with status | +| Workflow Split | Duplicates signal for parallel paths | +| Workflow Merge (AND) | Combines outputs using logical AND | +| Workflow Merge (OR) | Combines outputs using logical OR | + +### Task Operators + +| Operator | Purpose | +|----------|---------| +| Data Transfer | Move data between systems | +| Data Transform | Apply data transformations | +| Pipeline | Execute DI graphs locally or remotely | +| SAP Data Services Job | Run remote Data Services jobs | +| SAP HANA Flowgraph | Execute HANA flowgraphs | +| BW Process Chain | Run BW process chains | +| Notification | Send email notifications | + +--- + +## Data Transfer + +### Purpose + +Transfer data from SAP systems to cloud storage. + +### Supported Sources + +- SAP Business Warehouse (BW) +- SAP HANA + +### Supported Targets + +- Amazon S3 +- Google Cloud Storage +- Hadoop Distributed File System (HDFS) +- SAP Vora + +### Transfer Modes + +| Mode | Description | Best For | +|------|-------------|----------| +| BW OLAP | Default BW access | Small datasets | +| Generated HANA Views | Partition-based transfer | Large datasets | +| BW ODP | Datastore extraction | Cloud/distributed storage | + +#### BW OLAP Mode + +- Default mode for BW sources +- Uses standard OLAP interface (like RSRT2) +- Single result set processing +- Cell export limitations +- **Not suitable for large-scale transfers** + +#### Generated HANA Views Mode + +**Requirements:** +- Connection via DI Connection Management +- SAP BW 4.2.0 or later +- Working HANA database connection +- SSL certificates (if required) +- Query with generated calculation view (no restricted attributes) + +**Advantage:** Transfers partitions separately, enabling large result sets and parallel processing. + +#### BW ODP Mode + +**Works with:** Datastores only + +**Supported Targets:** +- Azure Data Lake +- Google Cloud Storage +- HDFS +- Alibaba OSS +- Amazon S3 +- Semantic Data Lake +- Azure Storage Blob + +**Note:** Partition processing is sequential, not parallel. 
+ +### Configuration + +``` +Source Connection: BW_SYSTEM +Target Connection: S3_BUCKET +Transfer Mode: Generated HANA Views +Source Query: /NAMESPACE/QUERY +Target Path: /data/export/ +``` + +--- + +## Remote Execution + +### Pipeline Operator + +Execute SAP Data Intelligence graphs. + +**Options:** +- Local execution (same DI instance) +- Remote execution (different DI instance) +- Parameter passing +- Synchronous/asynchronous + +**Configuration:** +``` +Graph: /namespace/my_pipeline +Connection: (for remote) +Parameters: key=value pairs +Wait for Completion: Yes/No +``` + +### SAP Data Services Job + +Execute jobs in remote SAP Data Services systems. + +**Prerequisites:** +- Data Services connection configured +- Job accessible from DI + +**Configuration:** +``` +Connection: DS_CONNECTION +Repository: REPO_NAME +Job: JOB_NAME +Global Variables: VAR1=VALUE1 +``` + +### SAP HANA Flowgraph + +Execute flowgraphs in remote HANA systems. + +**Prerequisites:** +- HANA connection configured +- Flowgraph deployed + +**Configuration:** +``` +Connection: HANA_CONNECTION +Flowgraph: SCHEMA.FLOWGRAPH_NAME +Parameters: (if applicable) +``` + +### BW Process Chain + +Execute SAP BW process chains. + +**Prerequisites:** +- BW connection configured +- Process chain accessible + +**Configuration:** +``` +Connection: BW_CONNECTION +Process Chain: CHAIN_ID +Variant: (if applicable) +Wait for Completion: Yes/No +``` + +--- + +## Control Flow + +### Workflow Split + +Duplicates incoming signal to multiple output ports. + +``` + ┌──→ [Task A] +[Trigger] → [Split] ┼──→ [Task B] + └──→ [Task C] +``` + +**Use Case:** Parallel execution paths + +### Workflow Merge (AND) + +Combines multiple inputs using logical AND. Sends output only when **all** inputs received. + +``` +[Task A] ──┐ +[Task B] ──┼──→ [Merge AND] → [Next Task] +[Task C] ──┘ +``` + +**Use Case:** Wait for all parallel tasks to complete + +### Workflow Merge (OR) + +Combines multiple inputs using logical OR. Sends output when **any** input received. + +``` +[Task A] ──┐ +[Task B] ──┼──→ [Merge OR] → [Next Task] +[Task C] ──┘ +``` + +**Use Case:** Continue when first task completes + +### Error Handling + +**Error Port:** +- All task operators have error ports +- Connect error port to handle failures +- Unhandled errors terminate workflow + +``` + ┌── success ──→ [Continue] +[Task] ────────────┤ + └── error ──→ [Error Handler] +``` + +--- + +## Notifications + +### Notification Operator + +Send email notifications during workflow execution. + +**Configuration:** +``` +SMTP Connection: EMAIL_CONNECTION +To: recipients@company.com +CC: (optional) +Subject: Workflow ${workflow.name} - ${status} +Body: The workflow completed at ${timestamp} +Attachment: (optional file path) +``` + +### Use Cases + +- Success notifications +- Error alerts +- Progress updates +- Audit trail + +### Template Variables + +| Variable | Description | +|----------|-------------| +| `${workflow.name}` | Workflow name | +| `${status}` | Execution status | +| `${timestamp}` | Current timestamp | +| `${error.message}` | Error details (if failed) | + +--- + +## Best Practices + +### Design Principles + +1. **Clear Flow**: Design linear or clearly branching workflows +2. **Error Handling**: Always connect error ports +3. **Notifications**: Add alerts for critical failures +4. **Idempotency**: Design tasks to be re-runnable + +### Performance + +1. **Parallelize**: Use Split/Merge for independent tasks +2. **Optimize Transfers**: Choose appropriate transfer mode +3. 
**Monitor Progress**: Track workflow status +4. **Resource Planning**: Consider target system load + +### Reliability + +1. **Test Components**: Validate each task individually +2. **Handle Failures**: Implement retry logic where needed +3. **Clean Up**: Manage temporary data +4. **Document**: Maintain workflow documentation + +### Example Workflow + +``` +[Trigger] + ↓ +[Split] + ├──→ [Transfer from BW] ──→ [Merge AND] + └──→ [Transfer from HANA] ─┘ + ↓ + [Transform Data] + ↓ + [Load to Target] + ↓ + [Send Notification] + ↓ + [Terminator] +``` + +--- + +## Documentation Links + +- **Data Workflow Operators**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide/working-with-dataworkflow-operators](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide/working-with-dataworkflow-operators) +- **Transfer Data**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/blob/main/docs/modelingguide/working-with-dataworkflow-operators/transfer-data-b250a0b.md](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/blob/main/docs/modelingguide/working-with-dataworkflow-operators/transfer-data-b250a0b.md) +- **Transfer Modes**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/blob/main/docs/modelingguide/working-with-dataworkflow-operators/transfer-modes-a615280.md](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/blob/main/docs/modelingguide/working-with-dataworkflow-operators/transfer-modes-a615280.md) + +--- + +**Last Updated**: 2025-11-22 diff --git a/references/dtl-functions.md b/references/dtl-functions.md new file mode 100644 index 0000000..1632864 --- /dev/null +++ b/references/dtl-functions.md @@ -0,0 +1,879 @@ +# Data Transformation Language (DTL) Functions Reference + +Complete reference for Data Transformation Language functions in SAP Data Intelligence. + +## Table of Contents + +1. [Overview](#overview) +2. [String Functions](#string-functions) +3. [Numeric Functions](#numeric-functions) +4. [Date and Time Functions](#date-and-time-functions) +5. [Data Type Conversion Functions](#data-type-conversion-functions) +6. [Miscellaneous Functions](#miscellaneous-functions) + +--- + +## Overview + +Data Transformation Language (DTL) provides SQL-like functions for data processing in: + +- Data Transform operator +- Data preparation tools +- Structured data operators + +**Syntax Pattern:** +``` +FUNCTION_NAME(argument1, argument2, ...) +``` + +--- + +## String Functions + +Functions for extracting, manipulating, and returning string information. + +### CONCAT + +Concatenates two or more strings. + +```sql +CONCAT(string1, string2 [, string3, ...]) +``` + +**Example:** +```sql +CONCAT('Hello', ' ', 'World') -- Returns: 'Hello World' +``` + +### SUBSTRING + +Extracts a portion of a string. + +```sql +SUBSTRING(string, start_position [, length]) +``` + +**Example:** +```sql +SUBSTRING('SAP Data Intelligence', 5, 4) -- Returns: 'Data' +``` + +### SUBSTRAFTER + +Returns substring after the first occurrence of a delimiter. + +```sql +SUBSTRAFTER(string, delimiter) +``` + +**Example:** +```sql +SUBSTRAFTER('user@domain.com', '@') -- Returns: 'domain.com' +``` + +### SUBSTRBEFORE + +Returns substring before the first occurrence of a delimiter. + +```sql +SUBSTRBEFORE(string, delimiter) +``` + +**Example:** +```sql +SUBSTRBEFORE('user@domain.com', '@') -- Returns: 'user' +``` + +### LEFT + +Returns leftmost characters of a string. 
+ +```sql +LEFT(string, length) +``` + +**Example:** +```sql +LEFT('SAP Data', 3) -- Returns: 'SAP' +``` + +### RIGHT + +Returns rightmost characters of a string. + +```sql +RIGHT(string, length) +``` + +**Example:** +```sql +RIGHT('SAP Data', 4) -- Returns: 'Data' +``` + +### LENGTH + +Returns the length of a string. + +```sql +LENGTH(string) +``` + +**Example:** +```sql +LENGTH('SAP') -- Returns: 3 +``` + +### UPPER / UCASE + +Converts string to uppercase. + +```sql +UPPER(string) +UCASE(string) +``` + +**Example:** +```sql +UPPER('sap data') -- Returns: 'SAP DATA' +``` + +### LOWER / LCASE + +Converts string to lowercase. + +```sql +LOWER(string) +LCASE(string) +``` + +**Example:** +```sql +LOWER('SAP DATA') -- Returns: 'sap data' +``` + +### TRIM + +Removes leading and trailing spaces. + +```sql +TRIM(string) +``` + +**Example:** +```sql +TRIM(' SAP ') -- Returns: 'SAP' +``` + +### LTRIM + +Removes leading spaces. + +```sql +LTRIM(string) +``` + +### RTRIM + +Removes trailing spaces. + +```sql +RTRIM(string) +``` + +### LPAD + +Pads string on the left to specified length. + +```sql +LPAD(string, length [, pad_string]) +``` + +**Example:** +```sql +LPAD('42', 5, '0') -- Returns: '00042' +``` + +### RPAD + +Pads string on the right to specified length. + +```sql +RPAD(string, length [, pad_string]) +``` + +**Example:** +```sql +RPAD('SAP', 6, 'X') -- Returns: 'SAPXXX' +``` + +### REPLACE + +Replaces occurrences of a substring. + +```sql +REPLACE(string, search_string, replace_string) +``` + +**Example:** +```sql +REPLACE('SAP HANA', 'HANA', 'DI') -- Returns: 'SAP DI' +``` + +### LOCATE + +Returns position of substring in string. + +```sql +LOCATE(string, substring [, start_position]) +``` + +**Example:** +```sql +LOCATE('SAP Data Intelligence', 'Data') -- Returns: 5 +``` + +### ASCII + +Returns ASCII code of first character. + +```sql +ASCII(string) +``` + +**Example:** +```sql +ASCII('A') -- Returns: 65 +``` + +### CHAR + +Returns character for ASCII code. + +```sql +CHAR(integer) +``` + +**Example:** +```sql +CHAR(65) -- Returns: 'A' +``` + +--- + +## Numeric Functions + +Functions for mathematical operations on numeric data. + +### ABS + +Returns absolute value. + +```sql +ABS(number) +``` + +**Example:** +```sql +ABS(-42) -- Returns: 42 +``` + +### CEIL + +Rounds up to nearest integer. + +```sql +CEIL(number) +``` + +**Example:** +```sql +CEIL(4.2) -- Returns: 5 +``` + +### FLOOR + +Rounds down to nearest integer. + +```sql +FLOOR(number) +``` + +**Example:** +```sql +FLOOR(4.8) -- Returns: 4 +``` + +### ROUND + +Rounds to specified decimal places. + +```sql +ROUND(number [, decimal_places]) +``` + +**Example:** +```sql +ROUND(3.14159, 2) -- Returns: 3.14 +``` + +### MOD + +Returns remainder of division. + +```sql +MOD(dividend, divisor) +``` + +**Example:** +```sql +MOD(10, 3) -- Returns: 1 +``` + +### POWER + +Returns base raised to exponent. + +```sql +POWER(base, exponent) +``` + +**Example:** +```sql +POWER(2, 3) -- Returns: 8 +``` + +### SQRT + +Returns square root. + +```sql +SQRT(number) +``` + +**Example:** +```sql +SQRT(16) -- Returns: 4 +``` + +### EXP + +Returns e raised to power. + +```sql +EXP(number) +``` + +**Example:** +```sql +EXP(1) -- Returns: 2.71828... +``` + +### LN + +Returns natural logarithm. + +```sql +LN(number) +``` + +**Example:** +```sql +LN(2.71828) -- Returns: ~1 +``` + +### LOG + +Returns logarithm with specified base. 
+ +```sql +LOG(base, number) +``` + +**Example:** +```sql +LOG(10, 100) -- Returns: 2 +``` + +### SIGN + +Returns sign of number (-1, 0, or 1). + +```sql +SIGN(number) +``` + +**Example:** +```sql +SIGN(-42) -- Returns: -1 +``` + +### UMINUS + +Returns negation of number. + +```sql +UMINUS(number) +``` + +**Example:** +```sql +UMINUS(42) -- Returns: -42 +``` + +### RAND + +Returns random number between 0 and 1. + +```sql +RAND() +``` + +--- + +## Date and Time Functions + +Functions for date and time operations. + +### CURRENT_UTCDATE + +Returns current UTC date. + +```sql +CURRENT_UTCDATE() +``` + +### CURRENT_UTCTIME + +Returns current UTC time. + +```sql +CURRENT_UTCTIME() +``` + +### CURRENT_UTCTIMESTAMP + +Returns current UTC timestamp. + +```sql +CURRENT_UTCTIMESTAMP() +``` + +### ADD_DAYS + +Adds days to a date. + +```sql +ADD_DAYS(date, days) +``` + +**Example:** +```sql +ADD_DAYS('2025-01-01', 30) -- Returns: '2025-01-31' +``` + +### ADD_MONTHS + +Adds months to a date. + +```sql +ADD_MONTHS(date, months) +``` + +**Example:** +```sql +ADD_MONTHS('2025-01-15', 2) -- Returns: '2025-03-15' +``` + +### ADD_YEARS + +Adds years to a date. + +```sql +ADD_YEARS(date, years) +``` + +### ADD_SECONDS + +Adds seconds to a timestamp. + +```sql +ADD_SECONDS(timestamp, seconds) +``` + +### DAYS_BETWEEN + +Returns days between two dates. + +```sql +DAYS_BETWEEN(date1, date2) +``` + +**Example:** +```sql +DAYS_BETWEEN('2025-01-01', '2025-01-31') -- Returns: 30 +``` + +### MONTHS_BETWEEN + +Returns months between two dates. + +```sql +MONTHS_BETWEEN(date1, date2) +``` + +### YEARS_BETWEEN + +Returns years between two dates. + +```sql +YEARS_BETWEEN(date1, date2) +``` + +### SECONDS_BETWEEN + +Returns seconds between two timestamps. + +```sql +SECONDS_BETWEEN(timestamp1, timestamp2) +``` + +### EXTRACT + +Extracts date/time component. + +```sql +EXTRACT(component FROM date_or_timestamp) +``` + +**Components:** YEAR, MONTH, DAY, HOUR, MINUTE, SECOND + +**Example:** +```sql +EXTRACT(YEAR FROM '2025-06-15') -- Returns: 2025 +``` + +### YEAR + +Returns year from date. + +```sql +YEAR(date) +``` + +### MONTH + +Returns month from date (1-12). + +```sql +MONTH(date) +``` + +### DAYOFMONTH + +Returns day of month (1-31). + +```sql +DAYOFMONTH(date) +``` + +### DAYOFYEAR + +Returns day of year (1-366). + +```sql +DAYOFYEAR(date) +``` + +### WEEK + +Returns week number (1-53). + +```sql +WEEK(date) +``` + +### ISOWEEK + +Returns ISO week number. + +```sql +ISOWEEK(date) +``` + +### QUARTER + +Returns quarter (1-4). + +```sql +QUARTER(date) +``` + +### HOUR + +Returns hour (0-23). + +```sql +HOUR(timestamp) +``` + +### MINUTE + +Returns minute (0-59). + +```sql +MINUTE(timestamp) +``` + +### SECOND + +Returns second (0-59). + +```sql +SECOND(timestamp) +``` + +### DAYNAME + +Returns day name. + +```sql +DAYNAME(date) +``` + +**Example:** +```sql +DAYNAME('2025-01-01') -- Returns: 'Wednesday' +``` + +### MONTHNAME + +Returns month name. + +```sql +MONTHNAME(date) +``` + +### LAST_DAY + +Returns last day of month. + +```sql +LAST_DAY(date) +``` + +**Example:** +```sql +LAST_DAY('2025-02-15') -- Returns: '2025-02-28' +``` + +### NEXT_DAY + +Returns next occurrence of weekday. + +```sql +NEXT_DAY(date, weekday) +``` + +--- + +## Data Type Conversion Functions + +Functions for converting between data types. + +### TO_STRING + +Converts to string. 
+ +```sql +TO_STRING(value [, format]) +``` + +**Example:** +```sql +TO_STRING(12345) -- Returns: '12345' +TO_STRING(3.14, '0.00') -- Returns: '3.14' +``` + +### TO_INTEGER + +Converts to integer. + +```sql +TO_INTEGER(value) +``` + +**Example:** +```sql +TO_INTEGER('42') -- Returns: 42 +TO_INTEGER(3.7) -- Returns: 3 +``` + +### TO_DECIMAL + +Converts to decimal. + +```sql +TO_DECIMAL(value [, precision, scale]) +``` + +**Example:** +```sql +TO_DECIMAL('123.45', 10, 2) -- Returns: 123.45 +``` + +### TO_FLOATING + +Converts to floating point. + +```sql +TO_FLOATING(value) +``` + +### TO_DATE + +Converts to date. + +```sql +TO_DATE(string [, format]) +``` + +**Example:** +```sql +TO_DATE('2025-01-15', 'YYYY-MM-DD') +TO_DATE('15/01/2025', 'DD/MM/YYYY') +``` + +### TO_TIME + +Converts to time. + +```sql +TO_TIME(string [, format]) +``` + +**Example:** +```sql +TO_TIME('14:30:00', 'HH24:MI:SS') +``` + +### TO_DATETIME + +Converts to datetime/timestamp. + +```sql +TO_DATETIME(string [, format]) +``` + +--- + +## Miscellaneous Functions + +### CASE + +Conditional expression. + +```sql +CASE + WHEN condition1 THEN result1 + WHEN condition2 THEN result2 + ELSE default_result +END +``` + +**Example:** +```sql +CASE + WHEN status = 'A' THEN 'Active' + WHEN status = 'I' THEN 'Inactive' + ELSE 'Unknown' +END +``` + +### COALESCE + +Returns first non-null value. + +```sql +COALESCE(value1, value2, ...) +``` + +**Example:** +```sql +COALESCE(phone, mobile, 'No number') -- Returns first non-null +``` + +### IFNULL + +Returns second value if first is null. + +```sql +IFNULL(value, replacement) +``` + +**Example:** +```sql +IFNULL(discount, 0) -- Returns 0 if discount is null +``` + +### NULLIF + +Returns null if values are equal. + +```sql +NULLIF(value1, value2) +``` + +**Example:** +```sql +NULLIF(quantity, 0) -- Returns null if quantity is 0 +``` + +### GREATEST + +Returns largest value. + +```sql +GREATEST(value1, value2, ...) +``` + +**Example:** +```sql +GREATEST(10, 20, 15) -- Returns: 20 +``` + +### LEAST + +Returns smallest value. + +```sql +LEAST(value1, value2, ...) +``` + +**Example:** +```sql +LEAST(10, 20, 15) -- Returns: 10 +``` + +### MAP + +Maps value to another value. + +```sql +MAP(input, value1, result1 [, value2, result2, ...] [, default]) +``` + +**Example:** +```sql +MAP(status, 'A', 'Active', 'I', 'Inactive', 'Unknown') +``` + +--- + +## Documentation Links + +- **Function Reference**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/functionreference](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/functionreference) +- **DTL Functions**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/functionreference/function-reference-for-data-transformation-language](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/functionreference/function-reference-for-data-transformation-language) + +--- + +**Last Updated**: 2025-11-22 diff --git a/references/graphs-pipelines.md b/references/graphs-pipelines.md new file mode 100644 index 0000000..6f3bb53 --- /dev/null +++ b/references/graphs-pipelines.md @@ -0,0 +1,494 @@ +# Graphs and Pipelines Guide + +Complete guide for graph/pipeline development in SAP Data Intelligence. + +## Table of Contents + +1. [Overview](#overview) +2. [Graph Concepts](#graph-concepts) +3. [Creating Graphs](#creating-graphs) +4. [Running Graphs](#running-graphs) +5. [Monitoring](#monitoring) +6. [Error Recovery](#error-recovery) +7. [Scheduling](#scheduling) +8. 
[Advanced Topics](#advanced-topics) +9. [Best Practices](#best-practices) + +--- + +## Overview + +Graphs (also called pipelines) are the core execution unit in SAP Data Intelligence. + +**Definition:** +A graph is a network of operators connected via typed input/output ports for data transfer. + +**Key Features:** +- Visual design in Modeler +- Two generations (Gen1, Gen2) +- Execution monitoring +- Error recovery (Gen2) +- Scheduling + +--- + +## Graph Concepts + +### Operator Generations + +**Gen1 Operators:** +- Legacy operator model +- Process-based execution +- Manual error handling +- Broad compatibility + +**Gen2 Operators:** +- Enhanced error recovery +- State management with snapshots +- Native multiplexing +- Better performance + +**Critical Rule:** Cannot mix Gen1 and Gen2 operators in the same graph. + +### Ports and Data Types + +**Port Types:** +- Input ports: Receive data +- Output ports: Send data +- Typed connections + +**Common Data Types:** +- string, int32, int64, float32, float64 +- blob (binary data) +- message (structured data) +- table (tabular data) +- any (flexible type) + +### Graph Structure + +``` +[Source Operator] ─── port ──→ [Processing Operator] ─── port ──→ [Target Operator] + │ │ │ + Output Port In/Out Ports Input Port +``` + +--- + +## Creating Graphs + +### Using the Modeler + +1. **Create New Graph** + - Open Modeler + - Click "+" to create graph + - Select graph name and location + +2. **Add Operators** + - Browse operator repository + - Drag operators to canvas + - Or search by name + +3. **Connect Operators** + - Drag from output port to input port + - Verify type compatibility + - Configure connection properties + +4. **Configure Operators** + - Select operator + - Set parameters in Properties panel + - Configure ports if needed + +5. **Validate Graph** + - Click Validate button + - Review warnings and errors + - Fix issues before running + +### Graph-Level Configuration + +**Graph Data Types:** +Create custom data types for the graph: + +```json +{ + "name": "CustomerRecord", + "properties": { + "id": "string", + "name": "string", + "amount": "float64" + } +} +``` + +**Graph Parameters:** +Define runtime parameters: + +``` +Parameters: + - name: source_path + type: string + default: /data/input/ + - name: batch_size + type: int32 + default: 1000 +``` + +### Groups and Tags + +**Groups:** +- Organize operators visually +- Share Docker configuration +- Resource allocation + +**Tags:** +- Label operators +- Filter and search +- Documentation + +--- + +## Running Graphs + +### Execution Methods + +**Manual Execution:** +1. Open graph in Modeler +2. Click "Run" button +3. Configure runtime parameters +4. Monitor execution + +**Programmatic Execution:** +Via Pipeline API or Data Workflow operators. 
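+
+As a hedged illustration of programmatic execution, the sketch below starts a graph over the Pipeline Engine REST API from Python. The endpoint path, the `src`/`configurationSubstitutions` payload fields, the basic-auth format, and the `handle` response field are assumptions based on typical tenants — verify them against your tenant's Pipeline Engine API documentation before relying on them.
+
+```python
+# Minimal sketch: start a graph via the Pipeline Engine REST API.
+# Assumptions (verify in your tenant's API docs): the endpoint path,
+# the "src"/"configurationSubstitutions" payload fields, basic auth in
+# the form "<tenant>\<user>", and the "handle" field in the response.
+import requests
+
+TENANT_URL = "https://my-tenant.example.com"       # placeholder tenant host
+AUTH = ("default\\my_user", "my_password")          # placeholder credentials
+
+payload = {
+    "src": "com.example.sales_pipeline",            # repository path of the graph
+    "configurationSubstitutions": {                 # runtime parameters, if defined
+        "source_path": "/data/2024/january/",
+        "batch_size": "5000",
+    },
+}
+
+resp = requests.post(
+    f"{TENANT_URL}/app/pipeline-modeler/service/v1/runtime/graphs",
+    json=payload,
+    auth=AUTH,
+)
+resp.raise_for_status()
+print("Graph handle:", resp.json().get("handle"))   # assumed field for polling status
+```
+
+From within another graph, the Pipeline Executor workflow operator provides the same capability without direct API calls.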
+ +### Execution Model + +**Process Model:** +- Each operator runs as process +- Communication via ports +- Coordinated by main engine + +**Gen2 Features:** +- Snapshot checkpoints +- State recovery +- Exactly-once semantics (configurable) + +### Runtime Parameters + +Pass parameters at execution: + +``` +source_path = /data/2024/january/ +batch_size = 5000 +target_table = SALES_JAN_2024 +``` + +### Resource Configuration + +**Memory/CPU:** +```json +{ + "resources": { + "requests": { + "memory": "1Gi", + "cpu": "500m" + }, + "limits": { + "memory": "4Gi", + "cpu": "2000m" + } + } +} +``` + +--- + +## Monitoring + +### Graph Status + +| Status | Description | +|--------|-------------| +| Pending | Waiting to start | +| Running | Actively executing | +| Completed | Finished successfully | +| Failed | Error occurred | +| Dead | Terminated unexpectedly | +| Stopping | Shutdown in progress | + +### Operator Status + +| Status | Description | +|--------|-------------| +| Initializing | Setting up | +| Running | Processing data | +| Stopped | Finished or stopped | +| Failed | Error in operator | + +### Monitoring Dashboard + +**Available Metrics:** +- Messages processed +- Processing time +- Memory usage +- Error counts + +**Access:** +1. Open running graph +2. Click "Monitor" tab +3. View real-time statistics + +### Diagnostic Information + +**Collect Diagnostics:** +1. Select running/failed graph +2. Click "Download Diagnostics" +3. Review logs and state + +**Archive Contents:** +- execution.json (execution details) +- graphs.json (graph definition) +- events.json (execution events) +- Operator logs +- State snapshots + +--- + +## Error Recovery + +### Gen2 Error Recovery + +**Automatic Recovery:** +1. Enable in graph settings +2. Configure snapshot interval +3. System recovers from last snapshot + +**Configuration:** +```json +{ + "autoRecovery": { + "enabled": true, + "snapshotInterval": "60s", + "maxRetries": 3 + } +} +``` + +### Snapshots + +**What's Saved:** +- Operator state +- Message queues +- Processing position + +**When Snapshots Occur:** +- Periodic (configured interval) +- On operator request +- Before shutdown + +### Delivery Guarantees + +| Mode | Description | +|------|-------------| +| At-most-once | May lose messages | +| At-least-once | May duplicate messages | +| Exactly-once | No loss or duplication | + +**Gen2 Default:** At-least-once with recovery. + +### Manual Error Handling + +**In Script Operators:** +```python +def on_input(msg_id, header, body): + try: + result = process(body) + api.send("output", api.Message(result)) + except Exception as e: + api.logger.error(f"Processing error: {e}") + api.send("error", api.Message({ + "error": str(e), + "input": body + })) +``` + +--- + +## Scheduling + +### Schedule Graph Executions + +**Cron Expression Format:** +``` +┌───────────── second (0-59) +│ ┌───────────── minute (0-59) +│ │ ┌───────────── hour (0-23) +│ │ │ ┌───────────── day of month (1-31) +│ │ │ │ ┌───────────── month (1-12) +│ │ │ │ │ ┌───────────── day of week (0-6, Sun=0) +│ │ │ │ │ │ +* * * * * * +``` + +**Examples:** +``` +0 0 * * * * # Every hour +0 0 0 * * * # Daily at midnight +0 0 0 * * 1 # Every Monday +0 0 6 1 * * # 6 AM on first of month +0 */15 * * * * # Every 15 minutes +``` + +### Creating Schedule + +1. Open graph +2. Click "Schedule" +3. Configure cron expression +4. Set timezone +5. 
Activate schedule + +### Managing Schedules + +**Actions:** +- View scheduled runs +- Pause schedule +- Resume schedule +- Delete schedule +- View execution history + +--- + +## Advanced Topics + +### Native Multiplexing (Gen2) + +Connect one output to multiple inputs: + +``` +[Source] ─┬──→ [Processor A] + ├──→ [Processor B] + └──→ [Processor C] +``` + +Or multiple outputs to one input: + +``` +[Source A] ──┐ +[Source B] ──┼──→ [Processor] +[Source C] ──┘ +``` + +### Graph Snippets + +Reusable graph fragments: + +1. **Create Snippet:** + - Select operators + - Right-click > "Save as Snippet" + - Name and save + +2. **Use Snippet:** + - Drag snippet to canvas + - Configure parameters + - Connect to graph + +### Parameterization + +**Substitution Variables:** +``` +${parameter_name} +${ENV.VARIABLE_NAME} +${SYSTEM.TENANT} +``` + +**In Operator Config:** +``` +File Path: ${source_path}/data_${DATE}.csv +Connection: ${target_connection} +``` + +### Import/Export + +**Export Graph:** +``` +1. Select graph +2. Right-click > Export +3. Include data types (optional) +4. Save as .zip +``` + +**Import Graph:** +``` +1. Right-click in repository +2. Import > From file +3. Select .zip file +4. Map dependencies +``` + +--- + +## Best Practices + +### Graph Design + +1. **Clear Data Flow**: Left-to-right, top-to-bottom +2. **Meaningful Names**: Descriptive operator names +3. **Group Related Operators**: Use groups for organization +4. **Document**: Add descriptions to operators +5. **Validate Often**: Check during development + +### Performance + +1. **Minimize Cross-Engine Communication** +2. **Use Appropriate Batch Sizes** +3. **Configure Resources**: Memory and CPU +4. **Enable Parallel Processing**: Where applicable +5. **Monitor and Tune**: Use metrics + +### Error Handling + +1. **Enable Auto-Recovery** (Gen2) +2. **Configure Appropriate Snapshot Interval** +3. **Implement Error Ports**: Route errors +4. **Log Sufficiently**: Debug information +5. **Test Failure Scenarios**: Validate recovery + +### Maintenance + +1. **Version Control**: Use graph versioning +2. **Document Changes**: Change history +3. **Test Before Deploy**: Validate thoroughly +4. **Monitor Production**: Watch for issues +5. **Clean Up**: Remove unused graphs + +--- + +## Troubleshooting + +### Common Issues + +| Issue | Cause | Solution | +|-------|-------|----------| +| Port type mismatch | Incompatible types | Use converter operator | +| Graph won't start | Resource constraints | Adjust resource config | +| Slow performance | Cross-engine overhead | Optimize operator placement | +| Recovery fails | Corrupt snapshot | Clear state, restart | +| Schedule not running | Incorrect cron | Verify expression | + +### Diagnostic Steps + +1. Check graph status +2. Review operator logs +3. Download diagnostics +4. Check resource usage +5. 
Verify connections + +--- + +## Documentation Links + +- **Using Graphs**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide/using-graphs](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide/using-graphs) +- **Creating Graphs**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide/using-graphs/creating-graphs](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide/using-graphs/creating-graphs) +- **Graph Examples**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/repositoryobjects/data-intelligence-graphs](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/repositoryobjects/data-intelligence-graphs) + +--- + +**Last Updated**: 2025-11-22 diff --git a/references/ml-scenario-manager.md b/references/ml-scenario-manager.md new file mode 100644 index 0000000..c8dd38a --- /dev/null +++ b/references/ml-scenario-manager.md @@ -0,0 +1,582 @@ +# ML Scenario Manager Guide + +Complete guide for machine learning in SAP Data Intelligence. + +## Table of Contents + +1. [Overview](#overview) +2. [ML Scenario Manager](#ml-scenario-manager) +3. [JupyterLab Environment](#jupyterlab-environment) +4. [Python SDK](#python-sdk) +5. [Training Pipelines](#training-pipelines) +6. [Metrics Explorer](#metrics-explorer) +7. [Model Deployment](#model-deployment) +8. [Versioning](#versioning) +9. [Best Practices](#best-practices) + +--- + +## Overview + +SAP Data Intelligence provides comprehensive machine learning capabilities: + +**Key Components:** +- **ML Scenario Manager**: Organize and manage ML artifacts +- **JupyterLab**: Interactive data science environment +- **Python SDK**: Programmatic ML operations +- **Metrics Explorer**: Visualize and compare results +- **Pipelines**: Productionize ML workflows + +--- + +## ML Scenario Manager + +Central application for organizing data science artifacts. + +### Accessing ML Scenario Manager + +1. Open SAP Data Intelligence Launchpad +2. Navigate to ML Scenario Manager tile +3. View existing scenarios or create new + +### Core Concepts + +**ML Scenario:** +- Container for datasets, notebooks, pipelines +- Supports versioning and branching +- Export/import for migration + +**Artifacts:** +- Datasets (registered data sources) +- Jupyter notebooks +- Pipelines (training, inference) +- Model files + +### Creating a Scenario + +1. Click "Create" in ML Scenario Manager +2. Enter scenario name and description +3. Choose initial version name +4. Add artifacts (datasets, notebooks, pipelines) + +### Scenario Structure + +``` +ML Scenario: Customer Churn Prediction +├── Datasets +│ ├── customer_data (registered) +│ └── transaction_history (registered) +├── Notebooks +│ ├── 01_data_exploration.ipynb +│ ├── 02_feature_engineering.ipynb +│ └── 03_model_training.ipynb +├── Pipelines +│ ├── training_pipeline +│ └── inference_pipeline +└── Versions + ├── v1.0 (initial) + ├── v1.1 (improved features) + └── v2.0 (new model architecture) +``` + +--- + +## JupyterLab Environment + +Interactive environment for data science experimentation. + +### Accessing JupyterLab + +1. From ML Scenario Manager, click "Open Notebook" +2. 
Or access directly from SAP Data Intelligence Launchpad + +### Available Kernels + +- Python 3 (with ML libraries) +- Custom kernels (via Docker configuration) + +### Pre-installed Libraries + +```python +# Data Processing +import pandas as pd +import numpy as np + +# Machine Learning +from sklearn.ensemble import RandomForestClassifier +from sklearn.model_selection import train_test_split + +# Deep Learning (available) +import tensorflow as tf +import torch + +# SAP Data Intelligence SDK +from sapdi import tracking +``` + +### Data Lake Access + +Access SAP Data Intelligence Data Lake from notebooks: + +```python +from sapdi.datalake import DataLakeClient + +client = DataLakeClient() + +# Read file +df = client.read_csv('/shared/data/customers.csv') + +# Write file +client.write_parquet(df, '/shared/output/processed.parquet') +``` + +### Virtual Environments + +Create isolated environments for dependencies: + +```bash +# Create virtual environment +python -m venv /home/user/myenv + +# Activate +source /home/user/myenv/bin/activate + +# Install packages +pip install xgboost lightgbm catboost +``` + +### Data Browser Extension + +Use the Data Browser to: +- Browse available data sources +- Preview data +- Import data to notebooks + +--- + +## Python SDK + +Programmatic interface for ML operations. + +### SDK Installation + +Pre-installed in JupyterLab and Python operators. + +```python +import sapdi +from sapdi import tracking +from sapdi import context +``` + +### MLTrackingSDK Functions + +| Function | Description | Limits | +|----------|-------------|--------| +| `start_run()` | Begin experiment tracking | Specify run_collection_name, run_name | +| `end_run()` | Complete tracking | Auto-adds start/end timestamps | +| `log_param()` | Log configuration values | name: 256 chars, value: 5000 chars | +| `log_metric()` | Log numeric metric | name: 256 chars (case-sensitive) | +| `log_metrics()` | Batch log metrics | Dictionary list format | +| `persist_run()` | Force save to storage | Auto at 1.5MB cache or end_run | +| `set_tags()` | Key-value pairs for filtering | runName is reserved | +| `set_labels()` | UI/semantic labels | Non-filterable | +| `delete_runs()` | Remove persisted metrics | By scenario/pipeline/execution | +| `get_runs()` | Retrieve run objects | Returns metrics, params, tags | +| `get_metrics_history()` | Get metric values | Max 1000 per metric | +| `update_run_info()` | Modify run metadata | Change name, collection, tags | + +### Metrics Tracking + +```python +from sapdi import tracking + +# Initialize tracking +with tracking.start_run(run_name="experiment_001") as run: + # Train model + model = train_model(X_train, y_train) + + # Log parameters + run.log_param("algorithm", "RandomForest") + run.log_param("n_estimators", 100) + run.log_param("max_depth", 10) + + # Log metrics + accuracy = evaluate(model, X_test, y_test) + run.log_metric("accuracy", accuracy) + run.log_metric("f1_score", f1) + + # Log model artifact + run.log_artifact("model.pkl", model) +``` + +### Tracking Parameters and Metrics + +**Parameters** (static values): +```python +run.log_param("learning_rate", 0.01) +run.log_param("batch_size", 32) +run.log_param("epochs", 100) +``` + +**Metrics** (can be logged multiple times): +```python +for epoch in range(epochs): + loss = train_epoch(model, data) + run.log_metric("loss", loss, step=epoch) + run.log_metric("val_loss", val_loss, step=epoch) +``` + +### Artifact Management + +```python +# Log files +run.log_artifact("model.pkl", model_bytes) 
+run.log_artifact("feature_importance.png", image_bytes) + +# Log directories +run.log_artifacts("./model_output/") + +# Retrieve artifacts +artifacts = tracking.get_run_artifacts(run_id) +model_data = artifacts.get("model.pkl") +``` + +### Artifact Class Methods + +| Method | Description | +|--------|-------------| +| `add_file()` | Add file to artifact, returns handler | +| `create()` | Create artifact with initial content, returns ID | +| `delete()` | Remove artifact metadata (not content) | +| `delete_content()` | Remove stored data | +| `download()` | Retrieve artifact contents to local storage | +| `get()` | Get artifact metadata | +| `list()` | List all artifacts in scenario | +| `open_file()` | Get handler for remote file access | +| `upload()` | Add files/directories to artifact | +| `walk()` | Depth-first traversal of artifact structure | + +### FileHandler Methods + +| Method | Description | +|--------|-------------| +| `get_reader()` | Returns file-like object for reading (use with `with`) | +| `get_writer()` | Returns object for incremental writing | +| `read()` | Load entire remote file at once | +| `write()` | Write strings, bytes, or files to data lake | + +**Important:** Files between 5 MB and 5 GB (inclusive) may be appended using the append functionality. For files smaller than 5 MB, use `get_writer()` for incremental writing instead. + +```python +from sapdi.artifact import Artifact + +# Create artifact +artifact_id = Artifact.create( + name="my_model", + description="Trained model", + content=model_bytes +) + +# List artifacts +artifacts = Artifact.list() + +# Download artifact +Artifact.download(artifact_id, local_path="/tmp/model/") + +# Read remote file +with Artifact.open_file(artifact_id, "model.pkl").get_reader() as f: + model = pickle.load(f) +``` + +### Context Information + +```python +from sapdi import context + +# Get scenario information +scenario_id = context.get_scenario_id() +version_id = context.get_version_id() + +# Get environment info +tenant = context.get_tenant() +user = context.get_user() +``` + +--- + +## Training Pipelines + +Productionize ML training workflows. + +### Pipeline Components + +``` +[Data Consumer] -> [Feature Engineering] -> [Model Training] -> [Metrics Logger] + | | | | + Read data Transform data Train model Log results +``` + +### Creating Training Pipeline + +1. Create new graph in Modeler +2. Add data consumer operator +3. Add Python operator for training +4. Add Submit Metrics operator +5. 
Connect and configure
+
+### Python Training Operator
+
+```python
+def on_input(msg):
+    import pickle
+    import pandas as pd
+    from sklearn.ensemble import RandomForestClassifier
+    from sklearn.model_selection import train_test_split
+    from sapdi import tracking
+
+    # Get data
+    df = pd.DataFrame(msg.body)
+
+    # Prepare features
+    X = df.drop('target', axis=1)
+    y = df['target']
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
+
+    # Train model
+    model = RandomForestClassifier(n_estimators=100)
+    model.fit(X_train, y_train)
+
+    # Evaluate
+    accuracy = model.score(X_test, y_test)
+
+    # Track metrics
+    with tracking.start_run(
+        run_collection_name="classification_experiments",
+        run_name="rf_training_001"
+    ) as run:
+        run.log_param("model_type", "RandomForest")
+        run.log_metric("accuracy", accuracy)
+        run.log_artifact("model.pkl", pickle.dumps(model))
+
+    api.send("output", api.Message({"accuracy": accuracy}))
+
+api.set_port_callback("input", on_input)
+```
+
+### ML Pipeline Templates
+
+Pre-built templates available:
+
+- **Auto ML Training**: Automated model selection
+- **HANA ML Training**: In-database training
+- **TensorFlow Training**: Deep learning
+- **Basic Training**: Generic template
+
+---
+
+## Metrics Explorer
+
+Visualize and compare ML experiments.
+
+### Accessing Metrics Explorer
+
+1. Open ML Scenario Manager
+2. Click "Metrics Explorer"
+3. Select scenario and version
+
+### Viewing Runs
+
+**Run List:**
+- Run ID and name
+- Status (completed, failed, running)
+- Start/end time
+- Logged metrics summary
+
+### Comparing Runs
+
+1. Select multiple runs
+2. Click "Compare"
+3. View side-by-side metrics
+4. Visualize metric trends
+
+### Metric Visualizations
+
+**Available Charts:**
+- Line charts (metrics over steps)
+- Bar charts (metric comparison)
+- Scatter plots (parameter vs metric)
+
+### Filtering and Search
+
+```
+Filter by:
+- Date range
+- Status
+- Parameter values
+- Metric thresholds
+```
+
+---
+
+## Model Deployment
+
+Deploy trained models for inference.
+ +### Deployment Options + +**Batch Inference:** +- Scheduled pipeline execution +- Process large datasets +- Results to storage/database + +**Real-time Inference:** +- API endpoint deployment +- Low-latency predictions +- Auto-scaling + +### Creating Inference Pipeline + +``` +[API Input] -> [Load Model] -> [Predict] -> [API Output] +``` + +### Python Inference Operator + +```python +import pickle +from sapdi.artifact import Artifact + +# Load model once (thread-safe if model object is immutable/read-only during inference) +# Note: model.predict() must be thread-safe for concurrent requests +model = None + +def load_model(): + global model + # Get artifact metadata first + artifacts = Artifact.list() + model_artifact = next((a for a in artifacts if a.name == "model"), None) + + if model_artifact: + # Download artifact and load model + with Artifact.open_file(model_artifact.id, "model.pkl").get_reader() as f: + model = pickle.load(f) + +def on_input(msg): + if model is None: + load_model() + + # Get input features + features = msg.body + + # Predict + prediction = model.predict([features])[0] + probability = model.predict_proba([features])[0] + + result = { + "prediction": int(prediction), + "probability": probability.tolist() + } + + api.send("output", api.Message(result)) + +api.set_port_callback("input", on_input) +``` + +### Deployment Monitoring + +Track deployed model performance: + +```python +# Log inference metrics +run.log_metric("inference_latency", latency_ms) +run.log_metric("prediction_count", count) +run.log_metric("error_rate", errors / total) +``` + +--- + +## Versioning + +Manage ML scenario versions. + +### Creating Versions + +1. Open ML Scenario Manager +2. Navigate to scenario +3. Click "Create Version" +4. Enter version name +5. Select base version (optional) + +### Version Workflow + +``` +v1.0 (initial baseline) + └── v1.1 (feature improvements) + └── v1.2 (hyperparameter tuning) + └── v2.0 (new architecture) + └── v2.1 (production release) +``` + +### Branching + +Create versions from any point: + +``` +v1.0 ─── v1.1 ─── v1.2 + └── v1.1-experiment (branch for testing) +``` + +### Export and Import + +**Export:** +1. Select scenario version +2. Click "Export" +3. Download ZIP file + +**Import:** +1. Click "Import" in ML Scenario Manager +2. Upload ZIP file +3. Configure target location + +--- + +## Best Practices + +### Experiment Management + +1. **Name Runs Descriptively**: Include key parameters +2. **Log Comprehensively**: All parameters and metrics +3. **Version Data**: Track data versions with runs +4. **Document Experiments**: Notes in notebooks + +### Pipeline Development + +1. **Start in Notebooks**: Prototype in JupyterLab +2. **Modularize Code**: Reusable functions +3. **Test Incrementally**: Validate each component +4. **Productionize Gradually**: Notebook to pipeline + +### Model Management + +1. **Version Models**: Link to training runs +2. **Validate Before Deploy**: Test on holdout data +3. **Monitor Production**: Track drift and performance +4. **Maintain Lineage**: Data to model to prediction + +### Resource Management + +1. **Right-size Resources**: Appropriate memory/CPU +2. **Clean Up Artifacts**: Remove unused experiments +3. 
**Archive Old Versions**: Export for long-term storage + +--- + +## Documentation Links + +- **Machine Learning**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/machinelearning](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/machinelearning) +- **ML Scenario Manager**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/machinelearning/ml-scenario-manager](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/machinelearning/ml-scenario-manager) +- **JupyterLab**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/machinelearning/jupyterlab-environment](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/machinelearning/jupyterlab-environment) +- **Python SDK**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/machinelearning](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/machinelearning) (see python-sdk documentation) + +--- + +**Last Updated**: 2025-11-22 diff --git a/references/modeling-advanced.md b/references/modeling-advanced.md new file mode 100644 index 0000000..c331fd1 --- /dev/null +++ b/references/modeling-advanced.md @@ -0,0 +1,387 @@ +# SAP Data Intelligence - Advanced Modeling Reference + +This reference covers advanced modeling topics including graph snippets, SAP cloud application integration, configuration types, local data types, and example graph templates. + +**Documentation Source**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide) + +## Table of Contents + +1. [Graph Snippets](#graph-snippets) + - [What Are Graph Snippets?](#what-are-graph-snippets) + - [Creating Graph Snippets](#creating-graph-snippets) + - [Importing Graph Snippets](#importing-graph-snippets) + - [Editing Graph Snippets](#editing-graph-snippets) +2. [SAP Cloud Applications Integration](#sap-cloud-applications-integration) + - [Supported SAP Cloud Applications](#supported-sap-cloud-applications) + - [Cloud Data Integration API](#cloud-data-integration-api) + - [Connection Setup](#connection-setup) + - [Metadata Explorer Features](#metadata-explorer-features) + - [Cloud Data Integration Operator](#cloud-data-integration-operator) +3. [Configuration Types](#configuration-types) + - [Purpose](#purpose) + - [Creating Configuration Types](#creating-configuration-types) + - [Supported Data Types](#supported-data-types) + - [Value Helpers](#value-helpers) + - [Property Options](#property-options) + - [Naming Convention](#naming-convention) +4. [Local Data Types](#local-data-types) + - [Characteristics](#characteristics) + - [Type Categories](#type-categories) + - [Creating Local Data Types](#creating-local-data-types) + - [Scalar Type Templates](#scalar-type-templates) +5. [Operator Metrics](#operator-metrics) + - [Consumer Operator Metrics](#consumer-operator-metrics) + - [Producer Operator Metrics](#producer-operator-metrics) + - [Debug Mode Metrics](#debug-mode-metrics) +6. 
[Example Graph Templates (141 Available)](#example-graph-templates-141-available) + - [Data Integration & ETL](#data-integration--etl) + - [Machine Learning Templates](#machine-learning-templates) + - [Streaming & Real-Time](#streaming--real-time) + - [Cloud & API Integration](#cloud--api-integration) + - [Data Quality & Transformation](#data-quality--transformation) + - [Script & Development](#script--development) + +--- + +## Graph Snippets + +Graph snippets are reusable entities that group operators and connections together, performing a single logical function. + +### What Are Graph Snippets? + +- Pre-built entities containing a group of operators and connections +- Perform a single logical function as a unit +- Allow adding multiple related operators at once +- Reduce setup time and ensure consistency + +### Creating Graph Snippets + +**Steps:** + +1. Launch SAP Data Intelligence Modeler → Graphs tab +2. Search or create a new graph +3. Select portion of graph (Shift+drag or Ctrl+A for entire graph) +4. Right-click → "Create Snippet" +5. Configure operator properties: + - Mark properties as configurable during import, OR + - Preconfigure with specific values +6. Add descriptions (optional) to parameters +7. Define reusable parameters with "Add Parameter" option +8. Review settings with "Show Parameters" +9. Click "Create" +10. Save with metadata in Save Snippet dialog + +**Requirements:** +- ABAP operators require connection and version selection before snippet creation +- Use fully qualified paths when saving + +### Importing Graph Snippets + +**Steps:** + +1. Open SAP Data Intelligence Modeler → Graphs tab +2. Find or create target graph +3. Right-click in editor workspace OR use toolbar icon +4. Select snippet from Import Snippet dialog +5. Configure settings in dialog +6. Complete import +7. Adjust individual operators as needed in configuration panel + +### Editing Graph Snippets + +After import, snippets become editable components: +- Configure each operator individually +- Modify connections between operators +- Adjust port configurations +- Update parameter values + +**Reference**: Repository Objects Reference for individual snippet documentation + +--- + +## SAP Cloud Applications Integration + +SAP Data Intelligence integrates with SAP cloud applications through the Cloud Data Integration API. + +### Supported SAP Cloud Applications + +| Application | Integration Type | +|-------------|------------------| +| SAP Fieldglass | Cloud Data Integration API | +| SAP Sales Cloud | Cloud Data Integration API | +| SAP Service Cloud | Cloud Data Integration API | + +### Cloud Data Integration API + +The API adheres to OData V4 specifications and provides two service types: + +**1. Administrative Service (One per system)** +- Provides catalog of providers organized by namespaces +- Each provider corresponds to a business object +- Contains related entity sets + +**2. Provider Service (One per data provider)** +- Provides access to metadata +- Enables access to entity set data +- Based on provider structure + +### Connection Setup + +**Connection Type**: `CLOUD_DATA_INTEGRATION` + +**Configuration Steps:** +1. Open Connection Management in SAP Data Intelligence +2. Create new connection with type `CLOUD_DATA_INTEGRATION` +3. Configure service endpoint +4. 
Test connection + +### Metadata Explorer Features + +- Entity sets function as datasets +- Namespaces and providers map to folder structures +- View and preview entity set metadata +- Browse available data objects + +### Cloud Data Integration Operator + +**Purpose**: Data ingestion in SAP Data Intelligence Modeler + +**Capabilities:** +- Streaming replication scenarios +- Uses Flowagent technology for execution +- Supports incremental data loads +- OData V4 compliant operations + +--- + +## Configuration Types + +Configuration types are JSON files that define properties and bind them with data types for operator configurations. + +### Purpose + +- Define operator configurations +- Specify parameters within operator configuration definitions +- Create reusable type definitions +- Establish UI behavior and validation rules + +### Creating Configuration Types + +**Steps:** + +1. Access Configuration Types tab in navigation pane +2. Click plus icon to add new type +3. Add optional description +4. Add properties with: + - Name + - Display name + - Data type + +### Supported Data Types + +| Type | Configuration | +|------|---------------| +| String | Format specifications, value helpers | +| Number | Numeric constraints | +| Boolean | True/false values | +| Integer | Whole number constraints | +| Object | Drill down into schema definitions | +| Array | Specify item types | +| Custom Type | Reference other configuration types | + +### Value Helpers + +Two types of value helpers available: + +1. **Pre-defined Lists**: Static list of options +2. **REST API Sources**: Dynamic values from API calls + +### Property Options + +| Option | Description | +|--------|-------------| +| Required | Mandates user input | +| ReadOnly | Prevents edits | +| Visible | Always shown | +| Hidden | Never shown | +| Conditional | Shown based on conditions | + +### Naming Convention + +Save with fully qualified paths: +``` +com.sap.others. +``` + +--- + +## Local Data Types + +Local data types are bound to a specific graph and visible only within that graph. + +### Characteristics + +- Supplement global data types +- Visible only in the containing graph +- Support Scalar types (exclusive to local data types) +- Enable pipeline-specific requirements + +### Type Categories + +| Category | Description | +|----------|-------------| +| Structure | Complex type with properties | +| Table | Collection type with row structure | +| Scalar | Simple type from template (LOCAL ONLY) | + +### Creating Local Data Types + +**Steps:** + +1. Open pipeline in Modeler +2. Select Show Configuration (no operators selected) +3. Expand Data Types section +4. Click plus icon → Create Data Type dialog +5. Enter name (start with letter, use letters/numbers/underscores) +6. Select type category +7. For Structure/Table: Add properties and description +8. For Scalar: Select template from options +9. Save + +### Scalar Type Templates + +Scalar types are exclusive to local data types. Select from provided templates based on your data requirements. + +--- + +## Operator Metrics + +Operator metrics provide runtime performance data for Structured Data Operators and Connectivity operators. 
+ +### Consumer Operator Metrics + +| Metric | Unit | Description | +|--------|------|-------------| +| Optimized | Boolean | Shows if operator is combined with others from same engine | +| Row Count | rows | Quantity of rows retrieved from source | +| Column Count | columns | Quantity of columns retrieved from source | +| Partition Count | partitions | Number of partitions when partitioning enabled | + +**Note**: When Optimized is activated, runtime metrics aren't shown. + +### Producer Operator Metrics + +| Metric | Unit | Description | +|--------|------|-------------| +| Row Count | rows | Quantity of rows sent to target | +| Current row rate | rows/s | Throughput velocity when writing | +| Batch count | batches | Number of batches when batch writing configured | +| Elapsed execution time | seconds | Duration of graph processing | + +### Debug Mode Metrics + +| Metric | Unit | Description | +|--------|------|-------------| +| Job CPU usage | % | Execution engine processor consumption | +| Job memory usage | KB | Execution engine memory consumption | +| Operator CPU usage | % | Operator subengine processor consumption | +| Memory usage | KB | Operator subengine memory consumption | + +**Access**: Metrics are automatically published upon graph execution. + +--- + +## Example Graph Templates (141 Available) + +Pre-built graph templates organized by category. + +### Data Integration & ETL + +| Template | Purpose | +|----------|---------| +| ABAP | ABAP system data extraction | +| BW HANA View to File | Full load from BW to files | +| Data Extraction from SAP ABAP Tables | SLT to File Store/Kafka | +| Data Extraction from SAP S/4HANA CDS Views | CDS to File Store/Kafka | +| HANA-to-File | HANA export to files | +| HANA-to-HANA | HANA replication | +| HANA-to-Kafka | HANA streaming to Kafka | +| Kafka-to-HANA | Kafka ingestion to HANA | +| Load/Ingest Files into SAP HANA | Full and incremental load | + +### Machine Learning Templates + +| Template | Purpose | +|----------|---------| +| Auto-ML Training and Inference | Automated ML workflows | +| TensorFlow Training | TensorFlow model training | +| TensorFlow MNIST | MNIST digit classification | +| TensorFlow Serving | Model serving | +| HANA-ML Forecast | Time series forecasting | +| HANA-ML Training/Inference | HANA PAL models | +| PyTorch Text Classification | NLP classification | +| ML Batch Inference | Batch scoring | +| ML Multi-Model Inference | Multiple model serving | +| R Classification | R-based classification | +| R Regression | R-based regression | + +### Streaming & Real-Time + +| Template | Purpose | +|----------|---------| +| Kafka Integration | Kafka producer/consumer | +| Google Pub/Sub | GCP messaging | +| Streaming Analytics | Real-time analytics | +| IoT Validation | IoT data validation | +| Message Generator | Test message generation | + +### Cloud & API Integration + +| Template | Purpose | +|----------|---------| +| Google BigQuery SQL | BigQuery queries | +| Google BigQuery Table Producer | BigQuery writes | +| Google Dataproc | Spark on GCP | +| AWS S3 | S3 storage operations | +| OData Query | OData service queries | +| REST API Client | REST service calls | +| Open Connectors | Multi-cloud connectivity | + +### Data Quality & Transformation + +| Template | Purpose | +|----------|---------| +| DQMM Address Cleanse | Address data cleansing | +| DQMM Person/Firm Cleanse | Entity cleansing | +| Data Masking | PII masking | +| Data Validation | Data quality rules | +| Multiplexer | Dynamic input/output routing | +| 
Binary-to-Table | Binary conversion | +| Table-to-Binary | Table serialization | +| Anonymization | Data anonymization | + +### Script & Development + +| Template | Purpose | +|----------|---------| +| Python Examples | Python operator demos | +| JavaScript Examples | JS operator demos | +| Node.js Examples | Node.js integration | +| R Examples | R script integration | +| Jupyter Examples | Notebook integration | + +**Access**: Templates available in Modeler → Graphs → Search or browse categories + +--- + +## Documentation Links + +- **Graph Snippets**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide/using-graph-snippets](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide/using-graph-snippets) +- **Cloud Integration**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/blob/main/docs/modelingguide/integrating-sap-cloud-applications-with-sap-data-intelligence-d6a8144.md](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/blob/main/docs/modelingguide/integrating-sap-cloud-applications-with-sap-data-intelligence-d6a8144.md) +- **Configuration Types**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/blob/main/docs/modelingguide/creating-configuration-types-2e63e4c.md](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/blob/main/docs/modelingguide/creating-configuration-types-2e63e4c.md) +- **Local Data Types**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/blob/main/docs/modelingguide/creating-local-data-types-c996f5e.md](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/blob/main/docs/modelingguide/creating-local-data-types-c996f5e.md) +- **Repository Graphs**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/repositoryobjects/data-intelligence-graphs](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/repositoryobjects/data-intelligence-graphs) + +--- + +**Last Updated**: 2025-11-22 diff --git a/references/operators-reference.md b/references/operators-reference.md new file mode 100644 index 0000000..5f23995 --- /dev/null +++ b/references/operators-reference.md @@ -0,0 +1,440 @@ +# SAP Data Intelligence Operators Reference + +Complete reference for built-in operators in SAP Data Intelligence. + +## Table of Contents + +1. [Operator Generations](#operator-generations) +2. [ABAP Operators](#abap-operators) +3. [File and Storage Operators](#file-and-storage-operators) +4. [Database Operators](#database-operators) +5. [Messaging Operators](#messaging-operators) +6. [Script Operators](#script-operators) +7. [Data Processing Operators](#data-processing-operators) +8. [Machine Learning Operators](#machine-learning-operators) +9. [Integration Operators](#integration-operators) +10. [Workflow Operators](#workflow-operators) + +--- + +## Operator Generations + +### Generation 1 (Gen1) + +Legacy operators with broad compatibility. + +**Characteristics:** +- Process-based execution +- Manual error handling +- Suitable for simpler workflows +- Compatible with older graphs + +### Generation 2 (Gen2) + +Enhanced operators with advanced features. 
+ +**Characteristics:** +- Improved error recovery +- State management with snapshots +- Native multiplexing support +- Better performance characteristics + +**Gen2 Exclusive Features:** +- Automatic graph recovery from failures +- Periodic state checkpoints +- Structured native message streaming +- Enhanced Python3 operator + +**Critical Rule:** Gen1 and Gen2 operators cannot be mixed in the same graph. + +--- + +## ABAP Operators + +Operators for integrating ABAP-based SAP systems. + +### ABAP CDS Reader + +Reads data from ABAP CDS views. + +**Configuration:** +- Connection: ABAP system connection +- CDS View: Target view name +- Selection: Filter criteria +- Package Size: Batch size for reading + +**Ports:** +- Output: CDS view data + +### ABAP Table Reader + +Reads data from ABAP tables. + +**Configuration:** +- Connection: ABAP system connection +- Table: Target table name +- Fields: Column selection +- Where Clause: Filter condition + +### SLT Connector + +Connects to SAP Landscape Transformation for real-time replication. + +**Configuration:** +- Mass Transfer ID: SLT configuration +- Table: Source table +- Initial Load: Enable/disable full load +- Delta Load: Enable/disable CDC + +### ODP Consumer + +Consumes Operational Data Provisioning sources. + +**Configuration:** +- Connection: ABAP connection +- ODP Context: Extraction context (SAPI, ABAP CDS, etc.) +- ODP Name: Data provider name +- Extraction Mode: Full or Delta + +--- + +## File and Storage Operators + +### Binary File Consumer + +Reads binary files from storage. + +**Configuration:** +- Connection: Storage connection +- Path: File path pattern +- Recursive: Include subdirectories + +**Output:** Binary content + +### Binary File Producer + +Writes binary content to files. + +**Configuration:** +- Connection: Storage connection +- Path: Output file path +- Mode: Overwrite, Append, or Fail if exists + +### Structured File Consumer + +Reads structured data from files (CSV, Parquet, ORC, JSON). + +**Configuration:** +- Connection: Storage connection +- Source: File path or pattern +- Format: CSV, Parquet, ORC, JSON +- Schema: Column definitions + +### Structured File Producer + +Writes structured data to files. 
+ +**Configuration:** +- Connection: Storage connection +- Target: Output path +- Format: CSV, Parquet, ORC, JSON +- Partition: Partitioning strategy + +### Cloud Storage Operators + +**Amazon S3:** +- S3 Consumer: Read from S3 buckets +- S3 Producer: Write to S3 buckets + +**Azure Blob/ADLS:** +- Azure Blob Consumer/Producer +- ADLS Gen2 Consumer/Producer + +**Google Cloud Storage:** +- GCS Consumer: Read from GCS buckets +- GCS Producer: Write to GCS buckets + +**HDFS:** +- HDFS Consumer: Read from Hadoop clusters +- HDFS Producer: Write to Hadoop clusters + +--- + +## Database Operators + +### SAP HANA Operators + +**HANA Client:** +- Executes SQL statements +- Supports DDL, DML, queries + +**Table Consumer:** +- Reads from HANA tables +- Supports filtering and projection + +**Table Producer:** +- Writes to HANA tables +- Supports INSERT, UPSERT, DELETE + +**Flowgraph Executor:** +- Runs HANA calculation views +- Executes stored procedures + +### SQL Operators + +**SQL Consumer:** +- Executes SELECT queries +- Supports parameterized queries + +**SQL Executor:** +- Runs DDL/DML statements +- Returns affected row count + +### Supported Databases + +- SAP HANA (Cloud and on-premise) +- SAP BW/4HANA +- Microsoft SQL Server +- Oracle Database +- PostgreSQL +- MySQL +- SAP IQ/Sybase + +--- + +## Messaging Operators + +### Kafka Operators + +**Kafka Consumer:** +- Subscribes to Kafka topics +- Supports consumer groups +- Offset management (earliest, latest, committed) + +**Kafka Producer:** +- Publishes to Kafka topics +- Key/value serialization +- Partitioning strategies + +### MQTT Operators + +**MQTT Consumer:** +- Subscribes to MQTT topics +- QoS level configuration + +**MQTT Producer:** +- Publishes MQTT messages +- Retain flag support + +### Additional Messaging + +- **NATS**: Lightweight messaging +- **WAMP**: Web Application Messaging Protocol +- **AWS SNS**: Amazon Simple Notification Service +- **SAP Event Mesh**: SAP cloud messaging + +--- + +## Script Operators + +### Python Operator (Gen2) + +Execute Python code within graphs. + +**Configuration:** +- Script: Python code +- Codelanguage: python36 or python39 + +**Example:** +```python +def on_input(msg_id, header, body): + # Process input + result = transform(body) + api.send("output", api.Message(result)) + +api.set_port_callback("input", on_input) +``` + +### JavaScript Operator + +Execute JavaScript/Node.js code. + +**Configuration:** +- Script: JavaScript code + +**Example:** +```javascript +$.setPortCallback("input", function(ctx, s) { + var result = process(s); + $.output(result); +}); +``` + +### R Operator + +Execute R scripts for statistical analysis. + +**Configuration:** +- Script: R code +- Libraries: Required R packages + +### Go Operator + +Execute Go code for high-performance processing. + +--- + +## Data Processing Operators + +### Data Transform + +Visual SQL-like transformation editor. 
+ +**Nodes:** +- Projection: Column selection/transformation +- Aggregation: GROUP BY with functions +- Join: Combine datasets +- Union: Merge datasets +- Case: Conditional logic +- Filter: Row filtering + +### Data Quality Operators + +**Validation Rule:** +- Define data quality rules +- Generate validation reports + +**Anonymization:** +- Mask sensitive data +- Hash, shuffle, or generalize + +**Data Mask:** +- Apply masking patterns +- Preserve format while anonymizing + +### Conversion Operators + +**Type Converters:** +- Binary to Table +- Table to Binary +- Dynamic to Static +- Static to Dynamic + +**Format Converters:** +- JSON Parser/Formatter +- CSV Parser/Formatter +- Avro Encoder/Decoder + +--- + +## Machine Learning Operators + +### TensorFlow Operators + +**TensorFlow Training:** +- Train TensorFlow models +- Distributed training support + +**TensorFlow Serving:** +- Deploy TensorFlow models +- REST API inference + +### PyTorch Operators + +**PyTorch Training:** +- Train PyTorch models +- GPU acceleration + +### HANA ML Operators + +**HANA ML Training:** +- Train models in HANA +- Automated ML (AutoML) + +**HANA ML Inference:** +- Score data with HANA ML models + +### Metrics Operators + +**Submit Metrics:** +- Track training metrics +- Integration with Metrics Explorer + +--- + +## Integration Operators + +### OData Operators + +**OData Consumer:** +- Query OData services +- Supports v2 and v4 + +**OData Producer:** +- Expose data as OData +- CRUD operations + +### REST API Operators + +**REST Client:** +- Call REST APIs +- Configurable HTTP methods +- Header/body templates + +**OpenAPI Client:** +- Generate clients from OpenAPI specs +- Automatic request/response handling + +### SAP Integration + +**SAP CPI Operator:** +- Trigger SAP Cloud Platform Integration flows + +**SAP Application Consumer/Producer:** +- Connect to SAP applications +- S/4HANA, ECC, SuccessFactors + +--- + +## Workflow Operators + +### Data Workflow Operators + +**Workflow Trigger:** +- Start workflow execution +- Scheduled or event-based + +**Workflow Terminator:** +- End workflow with status + +**Pipeline Executor:** +- Run child pipelines +- Pass parameters + +### Control Flow + +**BW Process Chain:** +- Execute SAP BW process chains + +**Data Services Job:** +- Run SAP Data Services jobs + +**HANA Flowgraph:** +- Execute HANA calculation views + +### Notification + +**Email Notification:** +- Send status emails +- Configurable templates + +--- + +## Documentation Links + +- **Operators Reference**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/repositoryobjects/data-intelligence-operators](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/repositoryobjects/data-intelligence-operators) +- **Graphs Reference**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/repositoryobjects/data-intelligence-graphs](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/repositoryobjects/data-intelligence-graphs) + +--- + +**Last Updated**: 2025-11-22 diff --git a/references/replication-flows.md b/references/replication-flows.md new file mode 100644 index 0000000..0e8d070 --- /dev/null +++ b/references/replication-flows.md @@ -0,0 +1,379 @@ +# Replication Flows Guide + +Complete guide for data replication in SAP Data Intelligence. + +## Table of Contents + +1. [Overview](#overview) +2. [Creating Replication Flows](#creating-replication-flows) +3. [Supported Sources](#supported-sources) +4. 
[Supported Targets](#supported-targets) +5. [Task Configuration](#task-configuration) +6. [Filters and Mappings](#filters-and-mappings) +7. [Delivery Guarantees](#delivery-guarantees) +8. [Cloud Storage Target Structure](#cloud-storage-target-structure) +9. [Kafka as Target](#kafka-as-target) +10. [Monitoring and Management](#monitoring-and-management) + +--- + +## Overview + +Replication flows enable data movement from sources to targets with support for: + +- Small or large datasets +- Batch or real-time processing +- Full or delta (CDC) loading +- Multiple target types + +**Key Workflow:** +1. Configure source and target connections +2. Create replication flow +3. Add tasks with datasets +4. Configure filters and mappings +5. Validate flow +6. Deploy to tenant repository +7. Run and monitor + +--- + +## Creating Replication Flows + +### Prerequisites + +- Source connection created and enabled in Connection Management +- Target connection configured +- Appropriate authorizations + +### Creation Steps + +1. **Open Modeler** in SAP Data Intelligence +2. **Navigate** to Replication Flows +3. **Create new** replication flow +4. **Configure source**: + - Select source connection + - Choose connection type (ABAP, database, etc.) + +5. **Configure target**: + - Select target type (database, cloud storage, Kafka) + - Set target-specific options + +6. **Add tasks** (see Task Configuration) +7. **Validate** the flow +8. **Deploy** to tenant repository +9. **Run** the flow + +--- + +## Supported Sources + +### ABAP Systems + +- SAP S/4HANA (Cloud and On-Premise) +- SAP ECC via SLT +- SAP BW/4HANA +- CDS views with extraction + +**Source Configuration:** +``` +Connection Type: ABAP +Extraction Type: CDS / ODP / Table +Package Size: 50000 +``` + +### Databases + +- SAP HANA +- Azure SQL Database (delta requires schema = username) +- Other SQL databases via connectors + +--- + +## Supported Targets + +### Database Targets + +**SAP HANA Cloud:** +- Write modes: INSERT, UPSERT, DELETE +- Exactly-once delivery with UPSERT +- Batch size configuration + +### Cloud Storage Targets + +| Target | Description | +|--------|-------------| +| Amazon S3 | AWS object storage | +| Azure Data Lake Storage Gen2 | Microsoft cloud storage | +| Google Cloud Storage | GCP object storage | +| SAP HANA Data Lake | SAP cloud data lake | + +**Cloud Storage Options:** +- Group Delta By: None, Date, Hour +- File Type: CSV, Parquet, JSON, JSONLines +- Suppress Duplicates: Minimize duplicate records + +**Container Name Limit:** 64 characters maximum + +### Kafka Target + +- Each dataset maps to a Kafka topic +- Topic names editable (need not match source) +- No container name limit + +--- + +## Task Configuration + +Tasks define what data to replicate and how. + +### Task Components + +``` +Task: + - Source dataset (table, view, etc.) + - Target specification + - Filter conditions + - Column mappings + - Load type (Initial/Delta) +``` + +### Load Types + +| Type | Description | +|------|-------------| +| Initial Load | Full data extraction | +| Delta Load | Changed data only (CDC) | +| Initial + Delta | Full load followed by continuous delta | + +### Creating Tasks + +1. Click "Add Task" +2. Select source object +3. Configure target (table name, topic, etc.) +4. Set filters (optional) +5. Define mappings (optional) +6. 
Choose load type + +--- + +## Filters and Mappings + +### Source Filters + +Reduce data volume with filter conditions: + +``` +Filter Examples: +- CreationDate ge datetime'2024-01-01T00:00:00' +- Region eq 'EMEA' +- Status in ('ACTIVE', 'PENDING') +``` + +### Column Mappings + +**Auto-mapping:** System matches source to target columns automatically + +**Custom Mapping:** Define specific source-to-target column relationships + +``` +Custom Mapping Example: + Source Column -> Target Column + SalesOrder -> SALES_ORDER_ID + SoldToParty -> CUSTOMER_ID + NetAmount -> AMOUNT +``` + +### Data Type Compatibility + +Ensure source and target data types are compatible. See `references/abap-integration.md` for ABAP type mappings. + +--- + +## Delivery Guarantees + +### Default: At-Least-Once + +May result in duplicate records during: +- Recovery from failures +- Network issues +- System restarts + +### Exactly-Once with Database Targets + +When using UPSERT to database targets (e.g., HANA Cloud): +- System eliminates duplicates automatically +- Achieved through key-based merge operations + +### Suppress Duplicates (Cloud Storage) + +For non-database targets: +- Enable "Suppress Duplicates" during initial load +- Minimizes but may not eliminate all duplicates + +--- + +## Cloud Storage Target Structure + +### Directory Hierarchy + +``` +// + .sap.rms.container # Container metadata + / + .sap.partfile.metadata # Dataset metadata + initial/ + .sap.partfile.metadata + part---. + _SUCCESS # Load completion marker + delta/ + / + .sap.partfile.metadata + part---. +``` + +### File Formats + +| Format | Options | +|--------|---------| +| CSV | Delimiter, header row, encoding | +| Parquet | Compression (SNAPPY, GZIP), compatibility mode | +| JSON | Standard JSON format | +| JSONLines | One JSON object per line | + +### Appended Columns + +System automatically adds metadata columns: + +| Column | Description | +|--------|-------------| +| `__operation_type` | L=Load, I=Insert, U=Update, B=Before, X=Delete, M=Archive | +| `__sequence_number` | Delta row ordering | +| `__timestamp` | UTC write timestamp | + +### Success Marker + +The `_SUCCESS` file indicates: +- Initial load completion +- Safe for downstream processing + +--- + +## Kafka as Target + +### Topic Configuration + +- One topic per source dataset +- Topic name defaults to dataset name (editable) +- Configure partitions and replication factor + +### Serialization + +| Format | Description | +|--------|-------------| +| AVRO | Schema in message; column names: alphanumeric + underscore only | +| JSON | No schema; flexible structure | + +**Note:** Schema registries are not supported. 
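+
+To make the message layout concrete, here is a hedged downstream consumer sketch in Python using the external `kafka-python` package (not part of SAP Data Intelligence). The topic name, broker address, and the `upsert`/`delete` handlers are placeholders; it assumes JSON serialization as configured above and reads the `opType` header and primary-key message key described in the following subsections.
+
+```python
+# Hedged sketch of a downstream consumer for a JSON-serialized replication topic.
+# Topic name, broker address, and the upsert()/delete() handlers are placeholders.
+import json
+from kafka import KafkaConsumer  # external kafka-python package
+
+consumer = KafkaConsumer(
+    "SALES_ORDERS",                                   # topic defaults to the source dataset name
+    bootstrap_servers="broker.example.com:9092",
+    value_deserializer=lambda v: json.loads(v.decode("utf-8")),
+)
+
+def upsert(key, record):                              # placeholder target writer
+    print("upsert", key, record)
+
+def delete(key):                                      # placeholder target delete
+    print("delete", key)
+
+for msg in consumer:
+    headers = {k: v.decode("utf-8") for k, v in (msg.headers or [])}
+    op = headers.get("opType", "L")                   # L/I/U/B/X/M, see Message Headers below
+    key = msg.key.decode("utf-8") if msg.key else None  # underscore-joined primary key values
+    if op in ("L", "I", "U"):
+        upsert(key, msg.value)
+    elif op == "X":
+        delete(key)
+    # "B" (before image) and "M" (archive) records are ignored in this sketch
+```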
+ +### Message Structure + +- Each source record = one Kafka message (not batched) +- Message key = concatenated primary key values (underscore separated) + +### Message Headers + +| Header | Values | +|--------|--------| +| `kafkaSerializationType` | AVRO or JSON | +| `opType` | L=Load, I=Insert, U=Update, B=Before, X=Delete, M=Archive | +| `Seq` | Sequential integer (delta order); empty for initial load | + +### Compression Options + +- None +- GZIP +- Snappy +- LZ4 +- Zstandard + +### Network Configuration + +For Kafka behind Cloud Connector: +- Broker addresses must match virtual hosts in SCC +- Use identical virtual and internal host values when possible + +--- + +## Monitoring and Management + +### Monitoring Tools + +**SAP Data Intelligence Monitoring:** +- View replication flow status +- Track task execution +- Monitor data volumes +- View error logs + +### Flow Status + +| Status | Description | +|--------|-------------| +| Deployed | Ready to run | +| Running | Active execution | +| Completed | Successfully finished | +| Failed | Error occurred | +| Stopped | Manually stopped | + +### Management Operations + +| Operation | Description | +|-----------|-------------| +| Edit | Modify existing flow | +| Undeploy | Remove from runtime | +| Delete | Remove flow definition | +| Clean Up | Remove source artifacts | + +### Clean Up Source Artifacts + +After completing replication: +1. Navigate to deployed flow +2. Select "Clean Up" +3. Removes delta pointers and temporary data + +--- + +## Best Practices + +### Planning + +1. **Assess Data Volume**: Plan for initial load duration +2. **Choose Delivery Mode**: Understand exactly-once requirements +3. **Design Target Schema**: Match source structure appropriately +4. **Plan Delta Strategy**: Determine grouping (none/date/hour) + +### Performance + +1. **Use Filters**: Reduce data volume at source +2. **Optimize Package Size**: Balance memory vs. round-trips +3. **Monitor Progress**: Track initial and delta loads +4. **Schedule Appropriately**: Avoid peak system times + +### Reliability + +1. **Enable Monitoring**: Track all flows actively +2. **Handle Duplicates**: Design for at-least-once semantics +3. **Validate Before Deploy**: Check all configurations +4. 
**Test with Sample Data**: Verify mappings and transformations + +--- + +## Documentation Links + +- **Replicating Data**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide/replicating-data](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide/replicating-data) +- **Create Replication Flow**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/blob/main/docs/modelingguide/replicating-data/create-a-replication-flow-a425e34.md](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/blob/main/docs/modelingguide/replicating-data/create-a-replication-flow-a425e34.md) +- **Cloud Storage Structure**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/blob/main/docs/modelingguide/replicating-data/cloud-storage-target-structure-12e0f97.md](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/blob/main/docs/modelingguide/replicating-data/cloud-storage-target-structure-12e0f97.md) +- **Kafka as Target**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/blob/main/docs/modelingguide/replicating-data/kafka-as-target-b9b819c.md](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/blob/main/docs/modelingguide/replicating-data/kafka-as-target-b9b819c.md) + +--- + +**Last Updated**: 2025-11-22 diff --git a/references/security-cdc.md b/references/security-cdc.md new file mode 100644 index 0000000..1499ac1 --- /dev/null +++ b/references/security-cdc.md @@ -0,0 +1,287 @@ +# Security, Data Protection, and CDC Guide + +Complete guide for security, data protection, and change data capture in SAP Data Intelligence. + +## Table of Contents + +1. [Security Overview](#security-overview) +2. [Data Protection](#data-protection) +3. [Audit Logging](#audit-logging) +4. [Change Data Capture (CDC)](#change-data-capture-cdc) +5. [Best Practices](#best-practices) + +--- + +## Security Overview + +### Responsibility Model + +**SAP Data Intelligence Role:** Data processor +**User Role:** Data owner and responsible for: +- PII (Personally Identifiable Information) security +- Regulatory compliance +- Audit trail configuration + +### Design-Time Security + +**Trace Logs:** +- Modeler produces trace logs with design-time artifacts +- Solution files and pipeline descriptions included +- **Do not embed sensitive information** in these objects + +**Connection Management:** +- Use Connection Manager for credentials +- Avoid hardcoding sensitive data in operators +- Leverage secure credential storage + +### Network Security + +**Cloud Connector:** +- TLS encryption for on-premise communication +- Virtual host mapping +- IP restrictions available + +**Principal Propagation:** +- SSO via Cloud Connector +- Certificate-based authentication +- User context preservation + +--- + +## Data Protection + +### PII Handling Guidelines + +1. **Identify PII**: Document all PII fields in data flows +2. **Minimize Collection**: Extract only necessary data +3. **Mask Sensitive Data**: Apply masking/anonymization +4. **Secure Storage**: Encrypt data at rest +5. 
**Access Control**: Implement authorization checks + +### Data Masking Operators + +| Operator | Purpose | +|----------|---------| +| Data Mask | Apply masking patterns | +| Anonymization | Hash, shuffle, generalize data | +| Validation Rule | Verify data quality | + +### Anonymization Techniques + +``` +Original: john.doe@company.com +Masked: j***@c***.com +Hashed: SHA256(email + salt) +Generalized: user@domain.com +``` + +### Encryption + +**In Transit:** +- HTTPS for all communications +- TLS 1.2+ required +- Certificate validation + +**At Rest:** +- Storage-level encryption +- Key management integration +- Customer-managed keys (where supported) + +--- + +## Audit Logging + +### Responsibility Model + +**SAP Data Intelligence Platform Logs (DI-native):** +- Platform-level access events (user login/logout) +- User actions (pipeline creation, modification, execution) +- System configuration changes +- API access attempts + +**Customer-Configured Logs (upstream/downstream systems):** +SAP Data Intelligence does not generate audit logs for: +- Sensitive data inputs from source systems +- Data transformations applied to PII/sensitive data +- Data outputs written to target systems + +**You must configure** source and target systems to generate audit logs for data-level operations. This is required because DI processes data in transit but does not independently track individual data record access. + +### Recommended Logging Events + +| Event Category | Examples | +|----------------|----------| +| Security Incidents | Unauthorized access attempts | +| Configuration Changes | Pipeline modifications | +| Personal Data Access | PII field reads | +| Data Modifications | Updates to relevant datasets | + +### Compliance Considerations + +**GDPR Requirements:** +- Right to access +- Right to erasure +- Data portability +- Breach notification + +**Implementation:** +1. Document data flows +2. Configure audit logging in source/target systems +3. Maintain data lineage +4. Implement retention policies + +### Administration Guide Reference + +See SAP Data Intelligence Administration Guide for: +- DPP (Data Protection and Privacy) configuration +- Audit log setup +- Compliance reporting + +--- + +## Change Data Capture (CDC) + +### Overview + +CDC enables tracking changes in source systems for incremental data loading. + +### Terminology + +**Cloud Data Integration (CDI)**: An internal component of SAP Data Intelligence that provides connectivity and data movement capabilities. CDI performs polling-based change detection by periodically querying source systems for modified records. 
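+
+To make the polling approach concrete, the sketch below periodically queries a source table for rows changed since the previous poll. It is an illustration only, written against a generic Python DB-API connection (for SAP HANA this could be the `hdbcli` driver) and a hypothetical `LAST_CHANGED_AT` column; CDI implements this pattern internally, so no such code is needed inside SAP Data Intelligence.
+
+```python
+# Simplified polling-based change detection (illustration only).
+# Assumptions: a DB-API compatible connection factory `connect`,
+# qmark parameter style, and a source-maintained LAST_CHANGED_AT column.
+import time
+from datetime import datetime, timezone
+
+def poll_changes(connect, interval_seconds=60):
+    """Repeatedly fetch rows changed since the previous poll."""
+    last_poll = datetime(1970, 1, 1, tzinfo=timezone.utc)
+    while True:
+        conn = connect()
+        try:
+            cursor = conn.cursor()
+            cursor.execute(
+                "SELECT * FROM SALES_ORDERS WHERE LAST_CHANGED_AT > ?",
+                (last_poll,),
+            )
+            for row in cursor.fetchall():
+                print("changed row:", row)   # hand off to the target side here
+        finally:
+            conn.close()
+        last_poll = datetime.now(timezone.utc)
+        time.sleep(interval_seconds)
+```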
+ +### CDC Approaches + +| Approach | Technology | Description | +|----------|------------|-------------| +| Trigger-based | Database triggers | Insert/Update/Delete tracking | +| Polling-based | Cloud Data Integration (CDI) | Periodic change detection via scheduled queries | +| Log-based | Transaction logs | Real-time change capture | + +### Supported Databases (Trigger-based) + +- DB2 +- SAP HANA +- Microsoft SQL Server (MSSQL) +- MySQL +- Oracle + +### CDC Operators (Deprecated) + +**Table Replicator V3 (Deprecated):** +- Simplifies graph creation for trigger-based CDC +- Manages trigger creation and change tracking + +**CDC Graph Generator (Deprecated):** +- Automates SQL generation for database-specific triggers +- Reduces manual effort per table + +### Cloud Data Integration CDC + +Cloud Data Integration (CDI) uses **polling-based** CDC technology: +- Periodic checks for changes +- No trigger installation required +- Suitable for cloud sources + +### Replication Flow CDC + +For modern CDC implementations, use **Replication Flows**: +- Built-in delta support +- Multiple source types +- Cloud-native approach + +**Delta Indicators in Replication:** +*(Used in Replication Flows to mark data changes)* + +| Code | Meaning | +|------|---------| +| L | Initial load row | +| I | New row inserted | +| U | Update (after image) | +| B | Update (before image) | +| X | Deleted row | +| M | Archiving operation | + +**Note**: Delta indicators are system-generated by Replication Flows when CDC is enabled. They apply across all supported source types (see Supported Databases section). Downstream operators or target systems can filter on these codes to handle different change types distinctly. + +### Performance Considerations + +CDC performance depends on: +1. Initial table size +2. Rate of changes in source +3. Network latency +4. Target system capacity + +--- + +## Best Practices + +### Security + +1. **Least Privilege**: Grant minimum required permissions +2. **Credential Rotation**: Regularly update passwords/keys (e.g., quarterly or per organizational policy) +3. **Network Segmentation**: Isolate DI from other workloads +4. **Monitoring**: Enable security monitoring and alerts + +### Data Protection + +1. **Data Classification**: Categorize data by sensitivity +2. **Anonymization**: Apply for non-production environments +3. **Access Logging**: Configure source/target systems to track who accesses sensitive data (see [Audit Logging - Responsibility Model](#responsibility-model) for details on DI-native vs. customer-configured logs) +4. **Retention**: Implement data retention policies + +### CDC Implementation + +1. **Choose Approach**: Select CDC method based on requirements +2. **Monitor Performance**: Track CDC overhead on source +3. **Handle Duplicates**: Design for at-least-once semantics (messages/rows may be delivered multiple times; implement idempotent logic in target systems to handle duplicates gracefully) +4. **Test Recovery**: Validate delta restart scenarios + +### Compliance + +1. **Document Everything**: Maintain data flow documentation +2. **Regular Audits**: Conduct periodic compliance reviews +3. **Training**: Ensure team understands DPP requirements +4. 
**Incident Response**: Have breach response plan ready + +--- + +## Operator Metrics for Monitoring + +### Consumer Metrics + +| Metric | Description | +|--------|-------------| +| Optimized | Whether operator is optimized with others | +| Row Count | Rows read from source | +| Column Count | Columns read from source | +| Partition Count | Partitions being read | + +### Producer Metrics + +| Metric | Description | +|--------|-------------| +| Row Count | Rows written to target | +| Current Row Rate | Rows per second | +| Batch Count | Batches written | +| Elapsed Execution Time | Total runtime | + +### Debug Mode Metrics + +| Metric | Description | +|--------|-------------| +| Job CPU Usage | CPU % by execution engine | +| Job Memory Usage | KB used by execution engine | +| Operator CPU Usage | CPU % by subengine | +| Operator Memory Usage | KB used by subengine | + +--- + +## Documentation Links + +- **Security and Data Protection**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/blob/main/docs/modelingguide/security-and-data-protection-39d8ba5.md](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/blob/main/docs/modelingguide/security-and-data-protection-39d8ba5.md) +- **Change Data Capture**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/blob/main/docs/modelingguide/changing-data-capture-cdc-023c75a.md](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/blob/main/docs/modelingguide/changing-data-capture-cdc-023c75a.md) +- **Operator Metrics**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/blob/main/docs/modelingguide/operator-metrics-994bc11.md](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/blob/main/docs/modelingguide/operator-metrics-994bc11.md) + +--- + +**Last Updated**: 2025-11-22 diff --git a/references/structured-data-operators.md b/references/structured-data-operators.md new file mode 100644 index 0000000..af13dc2 --- /dev/null +++ b/references/structured-data-operators.md @@ -0,0 +1,470 @@ +# Structured Data Operators Guide + +Complete guide for structured data processing in SAP Data Intelligence. + +## Table of Contents + +1. [Overview](#overview) +2. [Data Transform Operator](#data-transform-operator) +3. [Projection Node](#projection-node) +4. [Aggregation Node](#aggregation-node) +5. [Join Node](#join-node) +6. [Union Node](#union-node) +7. [Case Node](#case-node) +8. [Consumer Operators](#consumer-operators) +9. [Producer Operators](#producer-operators) +10. [Resiliency](#resiliency) + +--- + +## Overview + +Structured data operators provide SQL-like data processing capabilities with visual configuration. + +**Key Components:** +- **Data Transform**: Visual SQL editor for transformations +- **Consumer Operators**: Read structured data from sources +- **Producer Operators**: Write structured data to targets + +**Supported Formats:** +- CSV, Parquet, ORC, JSON (files) +- Database tables (HANA, SQL databases) +- SAP applications (S/4HANA, BW) + +--- + +## Data Transform Operator + +The Data Transform operator provides a visual editor for creating SQL-like transformations. 
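+
+The nodes described in this section map closely onto familiar SQL or dataframe operations. As a rough orientation only (inside the Modeler the same logic is configured visually, not coded), a hypothetical pandas equivalent of a Projection followed by an Aggregation looks like this:
+
+```python
+# Rough pandas analogue of a Projection -> Aggregation node chain
+# over hypothetical SALES input data (orientation only, not operator code).
+import pandas as pd
+
+sales = pd.DataFrame({
+    "REGION": ["EMEA", "EMEA", "APAC"],
+    "PRODUCT_CATEGORY": ["A", "B", "A"],
+    "SALES_AMOUNT": [100.0, 250.0, 80.0],
+    "INTERNAL_CODE": ["x1", "x2", "x3"],
+})
+
+# Projection: keep only the needed columns, dropping INTERNAL_CODE
+projected = sales[["REGION", "PRODUCT_CATEGORY", "SALES_AMOUNT"]]
+
+# Aggregation: GROUP BY with aggregate functions
+summary = projected.groupby(["REGION", "PRODUCT_CATEGORY"], as_index=False).agg(
+    TOTAL_SALES=("SALES_AMOUNT", "sum"),
+    ORDER_COUNT=("SALES_AMOUNT", "count"),
+)
+print(summary)
+```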
+ +### Custom Editor Features + +**Visual Designer:** +- Drag-and-drop node creation +- Visual connection of data flows +- Schema preview at each node + +**Available Nodes:** +- Source: Input data connection +- Target: Output data connection +- Projection: Column operations +- Aggregation: GROUP BY operations +- Join: Combine datasets +- Union: Merge datasets +- Case: Conditional logic +- Filter: Row filtering + +### Creating a Data Transform + +1. Add Data Transform operator to graph +2. Open Custom Editor +3. Add Source node (connect to input port) +4. Add transformation nodes +5. Add Target node (connect to output port) +6. Configure each node +7. Validate and close editor + +### Best Practices + +- Use meaningful node names +- Preview data at each step +- Optimize join order for performance +- Use filters early to reduce data volume + +--- + +## Projection Node + +The Projection node performs column-level operations. + +### Capabilities + +**Column Selection:** +- Include/exclude columns +- Rename columns +- Reorder columns + +**Column Transformation:** +- Apply expressions +- Use DTL functions +- Create calculated columns + +### Configuration + +``` +Source Columns: + - CUSTOMER_ID (include) + - CUSTOMER_NAME (include, rename to NAME) + - INTERNAL_CODE (exclude) + +Calculated Columns: + - FULL_NAME: CONCAT(FIRST_NAME, ' ', LAST_NAME) + - ORDER_YEAR: YEAR(ORDER_DATE) +``` + +### Expression Examples + +```sql +-- String concatenation +CONCAT(FIRST_NAME, ' ', LAST_NAME) + +-- Conditional value +CASE WHEN AMOUNT > 1000 THEN 'High' ELSE 'Low' END + +-- Date extraction +YEAR(ORDER_DATE) + +-- Null handling +COALESCE(DISCOUNT, 0) + +-- Type conversion +TO_DECIMAL(QUANTITY * PRICE, 15, 2) +``` + +--- + +## Aggregation Node + +The Aggregation node performs GROUP BY operations with aggregate functions. + +### Aggregate Functions + +| Function | Description | +|----------|-------------| +| COUNT | Count rows | +| COUNT_DISTINCT | Count unique values | +| SUM | Sum of values | +| AVG | Average of values | +| MIN | Minimum value | +| MAX | Maximum value | +| FIRST | First value | +| LAST | Last value | + +### Configuration + +``` +Group By Columns: + - REGION + - PRODUCT_CATEGORY + +Aggregations: + - TOTAL_SALES: SUM(SALES_AMOUNT) + - ORDER_COUNT: COUNT(ORDER_ID) + - AVG_ORDER: AVG(SALES_AMOUNT) + - FIRST_ORDER: MIN(ORDER_DATE) + - LAST_ORDER: MAX(ORDER_DATE) +``` + +### Example Output Schema + +``` +Input: + ORDER_ID, REGION, PRODUCT_CATEGORY, SALES_AMOUNT, ORDER_DATE + +Output: + REGION, PRODUCT_CATEGORY, TOTAL_SALES, ORDER_COUNT, AVG_ORDER, FIRST_ORDER, LAST_ORDER +``` + +--- + +## Join Node + +The Join node combines data from multiple sources. 
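+
+Before configuring a join it helps to picture the row semantics. The hypothetical pandas sketch below shows a LEFT join, the row fan-out that occurs when the right side has multiple matches (the situation addressed under Handling Duplicates below), and the aggregate-before-join remedy:
+
+```python
+# Illustration of LEFT join fan-out and aggregating before the join.
+# Hypothetical data; in the Modeler this is configured in the Join node.
+import pandas as pd
+
+customers = pd.DataFrame({"CUSTOMER_ID": [1, 2], "NAME": ["Acme", "Globex"]})
+orders = pd.DataFrame({"CUSTOMER_ID": [1, 1, 2], "AMOUNT": [100.0, 50.0, 75.0]})
+
+# Direct LEFT join: customer 1 appears twice because it has two orders
+joined = customers.merge(orders, on="CUSTOMER_ID", how="left")
+
+# Aggregate before joining to keep exactly one row per customer
+totals = orders.groupby("CUSTOMER_ID", as_index=False).agg(TOTAL=("AMOUNT", "sum"))
+enriched = customers.merge(totals, on="CUSTOMER_ID", how="left")
+
+print(joined)
+print(enriched)
+```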
+ +### Join Types + +| Type | Description | +|------|-------------| +| INNER | Only matching rows | +| LEFT | All left + matching right | +| RIGHT | All right + matching left | +| FULL | All rows from both sides | + +### Configuration + +``` +Join Type: LEFT +Left Source: ORDERS +Right Source: CUSTOMERS + +Join Conditions: + - ORDERS.CUSTOMER_ID = CUSTOMERS.ID + - ORDERS.REGION = CUSTOMERS.REGION (optional) +``` + +### Design Considerations + +**Performance Tips:** +- Put smaller table on right side for LEFT joins +- Use indexed columns in join conditions +- Filter data before joining when possible +- Avoid Cartesian products (missing join condition) + +**Multiple Joins:** +- Chain Join nodes for 3+ sources +- Consider join order for performance +- Validate intermediate results + +### Handling Duplicates + +``` +Scenario: Customer has multiple orders +Solution: Aggregate before or after join depending on requirement +``` + +--- + +## Union Node + +The Union node combines rows from multiple sources. + +### Union Types + +| Type | Description | +|------|-------------| +| UNION ALL | Include all rows (with duplicates) | +| UNION | Include distinct rows only | + +### Configuration + +``` +Union Type: UNION ALL + +Sources: + - ORDERS_2023 + - ORDERS_2024 + +Column Mapping: + - ORDER_ID -> ORDER_ID + - CUSTOMER -> CUSTOMER_ID (rename) + - AMOUNT -> SALES_AMOUNT (rename) +``` + +### Requirements + +- Same number of columns from each source +- Compatible data types +- Column mapping for different names + +### Adding Source Identifier + +Use a calculated column to track data origin: + +```sql +-- In source 1 projection +'2023' AS DATA_YEAR + +-- In source 2 projection +'2024' AS DATA_YEAR +``` + +--- + +## Case Node + +The Case node applies conditional logic to route or transform data. + +### Conditional Expressions + +```sql +CASE + WHEN ORDER_TYPE = 'SALES' THEN 'Revenue' + WHEN ORDER_TYPE = 'RETURN' THEN 'Refund' + ELSE 'Other' +END AS TRANSACTION_TYPE +``` + +### Routing Data + +Configure multiple output ports based on conditions: + +``` +Condition 1: REGION = 'EMEA' -> Output Port 1 +Condition 2: REGION = 'APAC' -> Output Port 2 +Default: -> Output Port 3 +``` + +### Nested Conditions + +```sql +CASE + WHEN AMOUNT > 10000 THEN + CASE + WHEN CUSTOMER_TYPE = 'VIP' THEN 'Priority High' + ELSE 'Priority Medium' + END + ELSE 'Priority Low' +END AS PRIORITY +``` + +--- + +## Consumer Operators + +Operators that read structured data from sources. + +### Structured File Consumer + +Reads from file storage (S3, Azure, GCS, HDFS, local). + +**Supported Formats:** +- CSV (with header options) +- Parquet +- ORC +- JSON (JSON Lines format) + +**Configuration:** +``` +Connection: S3 Connection +Source: s3://bucket/path/*.parquet +Format: Parquet +Schema: Auto-detect or manual +Partition Pruning: date_column > '2024-01-01' +``` + +**Excel Support:** +- Read Excel files (.xlsx) +- Specify sheet name +- Define header row + +### Structured SQL Consumer + +Reads from SQL databases. + +**Configuration:** +``` +Connection: HANA Connection +Table/View: SALES_DATA +Columns: Select columns +Filter: WHERE clause +``` + +### SAP Application Consumer + +Reads from SAP applications via OData or RFC. + +**Configuration:** +``` +Connection: S/4HANA Connection +Entity: A_SalesOrder +Select: OrderID, CustomerID, NetAmount +Filter: $filter=CreationDate gt '2024-01-01' +``` + +--- + +## Producer Operators + +Operators that write structured data to targets. + +### Structured File Producer + +Writes to file storage. 
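+
+For orientation, the partitioned, compressed layout this operator produces resembles what the short pandas/pyarrow sketch below writes to a hypothetical local directory (illustration only; the operator needs no such code and is driven by the configuration that follows):
+
+```python
+# Illustration: partitioned, SNAPPY-compressed Parquet output with
+# pandas/pyarrow, mirroring the operator configuration shown below.
+import pandas as pd
+
+df = pd.DataFrame({
+    "YEAR": [2024, 2024, 2023],
+    "MONTH": [1, 2, 12],
+    "SALES_AMOUNT": [100.0, 250.0, 80.0],
+})
+
+df.to_parquet(
+    "output/",                         # one subdirectory per YEAR/MONTH value
+    engine="pyarrow",
+    partition_cols=["YEAR", "MONTH"],
+    compression="snappy",
+)
+```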
+ +**Configuration:** +``` +Connection: S3 Connection +Target: s3://bucket/output/ +Format: Parquet +Partition Columns: YEAR, MONTH +Compression: SNAPPY +``` + +**Partitioning Strategies:** +- By column values (e.g., year, region) +- By time (hourly, daily) +- By size (max rows per file) + +### Structured Table Producer + +Writes to database tables. + +**Write Modes:** +- INSERT: Add new rows +- UPSERT: Insert or update +- DELETE: Remove matching rows +- TRUNCATE_INSERT: Clear and reload + +**Configuration:** +``` +Connection: HANA Connection +Table: TARGET_TABLE +Mode: UPSERT +Key Columns: ID, DATE +Batch Size: 10000 +``` + +### SAP Application Producer + +Writes to SAP applications. + +**Configuration:** +``` +Connection: S/4HANA Connection +Entity: A_SalesOrder +Operation: POST (create) / PATCH (update) +``` + +--- + +## Resiliency + +Structured data operators support resiliency features for reliable processing. + +### Checkpoint Configuration + +``` +Enable Checkpointing: Yes +Checkpoint Interval: 60 seconds +Checkpoint Location: /checkpoint/path +``` + +### Recovery Behavior + +**On Failure:** +1. Graph stops at failure point +2. State saved to checkpoint +3. Manual or auto restart +4. Resume from last checkpoint + +### Best Practices + +- Enable checkpointing for long-running jobs +- Use appropriate checkpoint intervals +- Store checkpoints on reliable storage +- Monitor checkpoint sizes + +### Exactly-Once Processing + +For exactly-once semantics: +- Use UPSERT to database targets +- Enable deduplication for file targets +- Implement idempotent transformations + +--- + +## Example: End-to-End Pipeline + +``` +[Structured File Consumer] -> [Data Transform] -> [Structured Table Producer] + (CSV files) | (HANA table) + | + [Projection] - Select columns + | + [Join] - Enrich with master data + | + [Aggregation] - Summarize by region + | + [Case] - Apply business rules +``` + +--- + +## Documentation Links + +- **Structured Data Operators**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide/working-with-structureddata-operators](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide/working-with-structureddata-operators) +- **Data Transform**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/blob/main/docs/modelingguide/working-with-structureddata-operators/data-transform-8fe8c02.md](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/blob/main/docs/modelingguide/working-with-structureddata-operators/data-transform-8fe8c02.md) + +--- + +**Last Updated**: 2025-11-22 diff --git a/references/subengines.md b/references/subengines.md new file mode 100644 index 0000000..7384c5b --- /dev/null +++ b/references/subengines.md @@ -0,0 +1,562 @@ +# Subengines Development Guide + +Complete guide for developing operators with subengines in SAP Data Intelligence. + +## Table of Contents + +1. [Overview](#overview) +2. [Subengine Architecture](#subengine-architecture) +3. [Python Subengine](#python-subengine) +4. [Node.js Subengine](#nodejs-subengine) +5. [C++ Subengine](#c-subengine) +6. [FlowAgent Subengine](#flowagent-subengine) +7. [Performance Optimization](#performance-optimization) +8. [Best Practices](#best-practices) + +--- + +## Overview + +Subengines enable operators to run on different runtimes within SAP Data Intelligence. 
+ +**Supported Subengines:** +- **Python 3.9**: Data science and ML workflows +- **Node.js**: JavaScript-based processing +- **C++**: High-performance native operators +- **ABAP**: ABAP Pipeline Engine (source systems) +- **FlowAgent**: Database connectivity + +**Key Benefits:** +- Language flexibility +- Performance optimization +- Specialized libraries +- Same-engine process sharing + +--- + +## Subengine Architecture + +### Execution Model + +``` +Main Engine (Coordinator) + ├── Python Subengine Process + │ ├── Python Operator 1 + │ └── Python Operator 2 (same process) + │ + ├── Node.js Subengine Process + │ └── JavaScript Operator + │ + └── Native Engine Process + └── Native Operator +``` + +### Communication + +**Same Engine Communication:** +- In-memory data transfer +- No serialization overhead +- Optimal performance + +**Cross-Engine Communication:** +- Serialization required +- Inter-process communication +- Higher latency + +### Engine Selection + +The optimizer selects engines to minimize communication: + +``` +Graph: [Python Op A] -> [Python Op B] -> [JS Op C] + +Execution: +- Ops A and B: Same Python process +- Op C: Separate Node.js process +- Data serialized between B and C +``` + +--- + +## Python Subengine + +The most commonly used subengine for data processing and ML. + +### Python 3.9 Operator (Gen2) + +**Creating Python Operator:** + +```python +# Operator script + +def on_input(msg_id, header, body): + """Process incoming message.""" + import pandas as pd + + # Process data + df = pd.DataFrame(body) + result = df.groupby('category').sum() + + # Send output + api.send("output", api.Message(result.to_dict())) + +# Register callback +api.set_port_callback("input", on_input) +``` + +### API Reference + +**Message Handling:** +```python +# Set port callback +api.set_port_callback("port_name", callback_function) + +# Send message +api.send("port_name", api.Message(body, attributes={})) + +# Create message with attributes +msg = api.Message( + body={"data": values}, + attributes={"source": "python"} +) +``` + +**Configuration Access:** +```python +# Get configuration parameter +value = api.config.param_name + +# Get with default +value = getattr(api.config, 'param_name', default_value) +``` + +**Logging:** +```python +api.logger.info("Processing started") +api.logger.warning("Potential issue detected") +api.logger.error("Error occurred") +``` + +### State Management + +```python +# Initialize state +state = {"counter": 0, "cache": {}} + +def on_input(msg_id, header, body): + global state + state["counter"] += 1 + + # Process with state + if body["id"] in state["cache"]: + result = state["cache"][body["id"]] + else: + result = process(body) + state["cache"][body["id"]] = result + + api.send("output", api.Message(result)) + +api.set_port_callback("input", on_input) +``` + +### Using External Libraries + +```python +# Pre-installed libraries available +import pandas as pd +import numpy as np +import sklearn +import tensorflow +import torch + +# Custom libraries via Dockerfile +# (see Creating Dockerfiles section) +``` + +### Managed Connections + +Access database connections from Python: + +```python +def on_input(msg_id, header, body): + # Get connection + conn = api.get_connection("HANA_CONNECTION") + + # Execute query + cursor = conn.cursor() + cursor.execute("SELECT * FROM TABLE") + rows = cursor.fetchall() + + api.send("output", api.Message(rows)) +``` + +--- + +## Node.js Subengine + +JavaScript-based operator development. 
+ +### Creating Node.js Operator + +```javascript +// Operator script using @sap/vflow-sub-node-sdk +const { Operator } = require("@sap/vflow-sub-node-sdk"); + +// Get operator instance +const operator = Operator.getInstance(); + +// Set up input port handler +operator.getInPort("input").onMessage((ctx) => { + // Process message + const data = ctx.body; + const result = processData(data); + + // Send to output port + operator.getOutPort("output").send(result); +}); + +function processData(data) { + // Transform data + return data.map((item) => { + return { + id: item.id, + value: item.value * 2 + }; + }); +} +``` + +### API Reference + +**Message Handling:** +```javascript +const { Operator } = require("@sap/vflow-sub-node-sdk"); +const operator = Operator.getInstance(); + +// Set port callback +operator.getInPort("port_name").onMessage((ctx) => { + // Access message body via ctx.body + const data = ctx.body; + // Process message +}); + +// Send to output port +operator.getOutPort("output").send(data); + +// Send to specific named port +operator.getOutPort("port_name").send(data); +``` + +**Configuration:** +```javascript +const operator = Operator.getInstance(); + +// Access config +const paramValue = operator.config.paramName; +``` + +**Logging:** +```javascript +const operator = Operator.getInstance(); + +// Use operator logger +operator.logger.info("Information message"); +operator.logger.debug("Debug message"); +operator.logger.error("Error message"); +``` + +### Node.js Data Types + +| SAP DI Type | Node.js Type | +|-------------|--------------| +| string | String | +| int32 | Number | +| int64 | BigInt | +| float32 | Number | +| float64 | Number | +| blob | Buffer | +| message | Object | + +### Safe Integer Handling + +```javascript +// Large integers may lose precision +// Use BigInt for int64 values +const { Operator } = require("@sap/vflow-sub-node-sdk"); +const operator = Operator.getInstance(); + +operator.getInPort("input").onMessage((ctx) => { + const value = BigInt(ctx.body.largeNumber); + // Process safely +}); +``` + +### Node Modules + +```javascript +// Built-in modules available +var fs = require('fs'); +var path = require('path'); +var https = require('https'); + +// Custom modules via Dockerfile +``` + +--- + +## C++ Subengine + +High-performance native operator development. + +### Getting Started + +1. Install C++ SDK +2. Create operator class +3. Implement interfaces +4. 
Compile and upload
+
+### Operator Implementation
+
+```cpp
+// custom_operator.h
+
+#include "sdi/subengine.h"
+#include "sdi/operator.h"
+
+class CustomOperator : public sdi::BaseOperator {
+public:
+    CustomOperator(const sdi::OperatorConfig& config);
+
+    void init() override;
+    void start() override;
+    void shutdown() override;
+
+private:
+    void onInput(const sdi::Message& msg);
+
+    std::string m_parameter;
+};
+```
+
+```cpp
+// custom_operator.cpp
+
+#include "custom_operator.h"
+
+CustomOperator::CustomOperator(const sdi::OperatorConfig& config)
+    : BaseOperator(config) {
+    m_parameter = config.get("parameter");
+}
+
+void CustomOperator::init() {
+    registerPortCallback("input",
+        [this](const sdi::Message& msg) { onInput(msg); });
+}
+
+void CustomOperator::start() {
+    LOG_INFO("Operator started");
+}
+
+void CustomOperator::onInput(const sdi::Message& msg) {
+    // Process message (a numeric vector payload is assumed here)
+    auto data = msg.body<std::vector<double>>();
+
+    // Transform
+    for (auto& val : data) {
+        val *= 2;
+    }
+
+    // Send output
+    send("output", sdi::Message(data));
+}
+
+void CustomOperator::shutdown() {
+    LOG_INFO("Operator shutdown");
+}
+```
+
+### Building and Uploading
+
+```bash
+# Build
+mkdir build && cd build
+cmake ..
+make
+
+# Package
+tar -czvf operator.tar.gz libcustom_operator.so manifest.json
+
+# Upload via System Management
+```
+
+---
+
+## FlowAgent Subengine
+
+Database connectivity subengine.
+
+### Purpose
+
+FlowAgent provides:
+- Database connection pooling
+- Efficient data transfer
+- Native database drivers
+
+### Supported Databases
+
+- SAP HANA
+- SAP IQ
+- Microsoft SQL Server
+- Oracle
+- PostgreSQL
+- MySQL
+- DB2
+
+### FlowAgent Operators
+
+Pre-built operators using FlowAgent:
+
+- **SQL Consumer**: Execute SELECT queries
+- **SQL Executor**: Execute DDL/DML
+- **Table Consumer**: Read tables
+- **Table Producer**: Write tables
+
+### Configuration
+
+```
+Connection: Database Connection ID
+SQL Statement: SELECT * FROM SALES WHERE YEAR = 2024
+Batch Size: 10000
+Fetch Size: 5000
+```
+
+---
+
+## Performance Optimization
+
+### Minimize Cross-Engine Communication
+
+```
+Bad:
+[Python A] -> [JS B] -> [Python C] -> [JS D]
+(3 cross-engine transfers, each requiring serialization)
+
+Good:
+[Python A] -> [Python C] -> [JS B] -> [JS D]
+(1 cross-engine transfer)
+```
+
+### Batch Processing
+
+```python
+# Accumulate inputs and process them in batches
+batch = []
+
+def on_input(msg_id, header, body):
+    batch.append(body)
+
+    if len(batch) >= 1000:
+        process_batch(batch)
+        batch.clear()
+```
+
+### Memory Management
+
+```python
+# Stream large data
+def on_input(msg_id, header, body):
+    import pandas as pd
+
+    # Process in chunks
+    for chunk in pd.read_csv(body, chunksize=10000):
+        result = process(chunk)
+        api.send("output", api.Message(result))
+```
+
+### Connection Pooling
+
+```python
+# Reuse connections
+_connection = None
+
+def get_connection():
+    global _connection
+    if _connection is None:
+        _connection = api.get_connection("DB_CONN")
+    return _connection
+```
+
+---
+
+## Best Practices
+
+### Operator Design
+
+1. **Single Responsibility**: One task per operator
+2. **Stateless When Possible**: Easier recovery
+3. **Handle Errors Gracefully**: Try-catch with logging
+4. 
**Clean Up Resources**: Close connections, files + +### Code Organization + +```python +# Good: Modular code + +def validate_input(data): + """Validate input data.""" + if not data: + raise ValueError("Empty input") + return True + +def transform_data(data): + """Transform data.""" + # Transformation logic + return result + +def on_input(msg_id, header, body): + try: + validate_input(body) + result = transform_data(body) + api.send("output", api.Message(result)) + except Exception as e: + api.logger.error(f"Error: {e}") + api.send("error", api.Message({"error": str(e)})) +``` + +### Testing + +1. **Unit Test Logic**: Test functions independently +2. **Integration Test**: Test with sample data +3. **Performance Test**: Verify throughput + +### Documentation + +```python +def on_input(msg_id, header, body): + """ + Process incoming sales data. + + Input Schema: + - order_id: string + - amount: float + - date: string (YYYY-MM-DD) + + Output Schema: + - order_id: string + - processed_amount: float + - quarter: int + """ + # Implementation +``` + +--- + +## Documentation Links + +- **Subengines Overview**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide/subengines](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide/subengines) +- **Python Subengine**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/blob/main/docs/modelingguide/subengines/create-operators-with-the-python-subengine-7e8f7d2.md](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/blob/main/docs/modelingguide/subengines/create-operators-with-the-python-subengine-7e8f7d2.md) +- **Node.js Subengine**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide/subengines](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/tree/main/docs/modelingguide/subengines) (Node.js SDK files) +- **C++ Subengine**: [https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/blob/main/docs/modelingguide/subengines/working-with-the-c-subengine-to-create-operators-d8f634c.md](https://github.com/SAP-docs/sap-hana-cloud-data-intelligence/blob/main/docs/modelingguide/subengines/working-with-the-c-subengine-to-create-operators-d8f634c.md) + +--- + +**Last Updated**: 2025-11-22 diff --git a/templates/basic-graph.json b/templates/basic-graph.json new file mode 100644 index 0000000..4dbe59a --- /dev/null +++ b/templates/basic-graph.json @@ -0,0 +1,100 @@ +{ + "name": "basic_data_pipeline", + "description": "Basic data processing pipeline template", + "version": "1.0.0", + "properties": { + "autoRecovery": { + "enabled": true, + "snapshotInterval": "60s" + } + }, + "parameters": [ + { + "name": "source_connection", + "type": "string", + "description": "Connection ID for source system (configure in Connection Management)", + "default": "" + }, + { + "name": "target_connection", + "type": "string", + "description": "Connection ID for target system (configure in Connection Management)", + "default": "" + }, + { + "name": "source_path", + "type": "string", + "description": "Source file path", + "default": "/data/input/" + }, + { + "name": "target_path", + "type": "string", + "description": "Target file path", + "default": "/data/output/" + }, + { + "name": "batch_size", + "type": "int32", + "description": "Processing batch size", + "default": 10000 + } + ], + "operators": [ + { + "name": "source", + "component": "com.sap.system.structuredFileConsumer", + "config": { + "connection": "${source_connection}", + "path": 
"${source_path}", + "format": "csv", + "header": true + } + }, + { + "name": "transform", + "component": "com.sap.dataTransform", + "config": { + "transformations": [] + } + }, + { + "name": "target", + "component": "com.sap.system.structuredFileProducer", + "config": { + "connection": "${target_connection}", + "path": "${target_path}", + "format": "parquet", + "compression": "SNAPPY" + } + } + ], + "connections": [ + { + "source": { + "operator": "source", + "port": "output" + }, + "target": { + "operator": "transform", + "port": "input" + } + }, + { + "source": { + "operator": "transform", + "port": "output" + }, + "target": { + "operator": "target", + "port": "input" + } + } + ], + "notes": [ + "This is a basic Gen2 pipeline template.", + "Configure source and target connections before running.", + "Customize transformations in the Data Transform operator.", + "Auto-recovery is enabled with 60-second snapshot intervals." + ] +} diff --git a/templates/ml-training-pipeline.json b/templates/ml-training-pipeline.json new file mode 100644 index 0000000..2b9f1c0 --- /dev/null +++ b/templates/ml-training-pipeline.json @@ -0,0 +1,99 @@ +{ + "$schema": "https://sap.github.io/data-intelligence/schemas/graph.json", + "name": "ml_training_pipeline", + "description": "Machine learning training pipeline template", + "version": "1.0.0", + "properties": { + "autoRecovery": { + "enabled": true, + "snapshotInterval": "120s" + } + }, + "parameters": [ + { + "name": "dataset_path", + "type": "string", + "description": "Training dataset path", + "default": "/ml/datasets/training.parquet" + }, + { + "name": "model_output_path", + "type": "string", + "description": "Model output path", + "default": "/ml/models/" + }, + { + "name": "test_split", + "type": "float64", + "description": "Test data split ratio", + "default": 0.2 + }, + { + "name": "n_estimators", + "type": "int32", + "description": "Number of estimators for RandomForest", + "default": 100 + }, + { + "name": "max_depth", + "type": "int32", + "description": "Maximum tree depth", + "default": 10 + } + ], + "operators": [ + { + "name": "data_loader", + "component": "com.sap.system.structuredFileConsumer", + "config": { + "connection": "${data_connection}", + "path": "${dataset_path}", + "format": "parquet" + } + }, + { + "name": "trainer", + "component": "com.sap.system.python3Operator", + "config": { + "script": "# ML Training Script\nimport pandas as pd\nimport pickle\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import accuracy_score, f1_score\nfrom sapdi import tracking\n\ndef on_input(msg_id, header, body):\n # Get parameters\n test_split = float(api.config.test_split)\n n_estimators = int(api.config.n_estimators)\n max_depth = int(api.config.max_depth)\n \n # Load data\n df = pd.DataFrame(body)\n \n # Prepare features and target\n X = df.drop('target', axis=1)\n y = df['target']\n \n # Split data\n X_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=test_split, random_state=42\n )\n \n # Train model\n model = RandomForestClassifier(\n n_estimators=n_estimators,\n max_depth=max_depth,\n random_state=42\n )\n model.fit(X_train, y_train)\n \n # Evaluate\n y_pred = model.predict(X_test)\n accuracy = accuracy_score(y_test, y_pred)\n f1 = f1_score(y_test, y_pred, average='weighted')\n \n # Track with ML Scenario Manager\n with tracking.start_run(run_name='rf_training') as run:\n run.log_param('algorithm', 'RandomForest')\n run.log_param('n_estimators', 
n_estimators)\n run.log_param('max_depth', max_depth)\n run.log_param('test_split', test_split)\n run.log_metric('accuracy', accuracy)\n run.log_metric('f1_score', f1)\n run.log_artifact('model.pkl', pickle.dumps(model))\n \n # Send results\n result = {\n 'accuracy': accuracy,\n 'f1_score': f1,\n 'model_path': f'{api.config.model_output_path}model.pkl'\n }\n api.send('output', api.Message(result))\n\napi.set_port_callback('input', on_input)" + } + }, + { + "name": "metrics_logger", + "component": "com.sap.ml.submitMetrics", + "config": { + "metricsType": "training" + } + } + ], + "connections": [ + { + "source": { + "operator": "data_loader", + "port": "output" + }, + "target": { + "operator": "trainer", + "port": "input" + } + }, + { + "source": { + "operator": "trainer", + "port": "output" + }, + "target": { + "operator": "metrics_logger", + "port": "input" + } + } + ], + "notes": [ + "This is a machine learning training pipeline template.", + "Customize the Python script for your specific model.", + "Configure dataset path and model parameters.", + "Metrics are automatically tracked in ML Scenario Manager.", + "Model artifacts are logged for versioning and deployment.", + "Adjust test_split, n_estimators, max_depth as needed." + ] +} diff --git a/templates/replication-flow.json b/templates/replication-flow.json new file mode 100644 index 0000000..c271b61 --- /dev/null +++ b/templates/replication-flow.json @@ -0,0 +1,90 @@ +{ + "name": "abap_to_hana_replication", + "description": "Replication flow from ABAP system to HANA Cloud", + "version": "1.0.0", + "source": { + "type": "ABAP", + "connection": "${abap_connection}", + "properties": { + "extractionType": "CDS", + "packageSize": 50000 + } + }, + "target": { + "type": "HANA", + "connection": "${hana_connection}", + "properties": { + "schema": "${target_schema}", + "writeMode": "UPSERT", + "batchSize": 10000 + } + }, + "tasks": [ + { + "name": "customer_master", + "source": { + "object": "I_Customer", + "type": "CDS_VIEW", + "filter": "" + }, + "target": { + "table": "CUSTOMER_MASTER", + "keyColumns": ["Customer"] + }, + "mapping": { + "mode": "auto", + "customMappings": [] + }, + "loadType": { + "initial": true, + "delta": true + } + }, + { + "name": "sales_orders", + "source": { + "object": "I_SalesOrder", + "type": "CDS_VIEW", + "filter": "CreationDate ge datetime'2024-01-01T00:00:00'" + }, + "target": { + "table": "SALES_ORDERS", + "keyColumns": ["SalesOrder"] + }, + "mapping": { + "mode": "auto", + "customMappings": [ + { + "source": "SalesOrder", + "target": "SALES_ORDER_ID" + }, + { + "source": "SoldToParty", + "target": "CUSTOMER_ID" + } + ] + }, + "loadType": { + "initial": true, + "delta": true + } + } + ], + "schedule": { + "enabled": false, + "cron": "0 0 * * * *", + "timezone": "UTC" + }, + "settings": { + "parallelTasks": 4, + "errorHandling": "CONTINUE", + "logging": "INFO" + }, + "notes": [ + "Configure ABAP and HANA connections before deployment.", + "Modify CDS view names for your specific data.", + "Adjust filters based on data volume requirements.", + "Enable schedule after successful initial load.", + "Uses UPSERT for exactly-once delivery semantics." + ] +}