R528 kfp support for accelerator devices #16
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Status: Closed

RobuRishabh wants to merge 6 commits into opendatahub-io:main from RobuRishabh:R528-Kfp-support-for-accelerator-devices.
Changes from all commits:
- c972632: Refactored Common Components into a shared module (RobuRishabh)
- 2ce8ba5: renamed common_components to common (RobuRishabh)
- 3205bc3: Added R528-Docling KFP support for accelerator devices (RobuRishabh)
- a2c25e7: Revert "Added R528-Docling KFP support for accelerator devices" (RobuRishabh)
- 4783d93: R1065: Complete refactoring with latest main changes (RobuRishabh)
- 14f35cf: R528: Added support for accelerator devices and updated the Readme (RobuRishabh)
Files changed (paths inferred from the relative imports, the `__all__` list, and the "renamed common_components to common" commit; the scraped diff does not show them directly):

common/__init__.py (new file, +32 lines):

```python
"""
Common components for Docling Kubeflow Pipelines.

This module contains shared components that are used across different
Docling pipeline implementations (standard and VLM).
"""

# Import all common components to make them easily accessible
from .components import (
    import_pdfs,
    create_pdf_splits,
    download_docling_models,
)

from .constants import (
    PYTHON_BASE_IMAGE,
    DOCLING_BASE_IMAGE,
    MODEL_TYPE_STANDARD,
    MODEL_TYPE_VLM,
    MODEL_TYPE_VLM_REMOTE,
)

__all__ = [
    "import_pdfs",
    "create_pdf_splits",
    "download_docling_models",
    "PYTHON_BASE_IMAGE",
    "DOCLING_BASE_IMAGE",
    "MODEL_TYPE_STANDARD",
    "MODEL_TYPE_VLM",
    "MODEL_TYPE_VLM_REMOTE",
]
```
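For orientation, a minimal sketch of what this re-export surface gives callers, assuming the package is importable as `common`:

```python
# Hypothetical consumer of the shared package: everything a pipeline needs
# comes from one import site thanks to the __all__ re-exports above.
from common import (
    DOCLING_BASE_IMAGE,
    create_pdf_splits,
    download_docling_models,
    import_pdfs,
)

print(DOCLING_BASE_IMAGE)  # quay.io/fabianofranz/docling-ubi9:2.54.0
```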
common/components.py (new file, +212 lines):

```python
from typing import List

from kfp import dsl

from .constants import PYTHON_BASE_IMAGE, DOCLING_BASE_IMAGE


@dsl.component(
    base_image=PYTHON_BASE_IMAGE,
    packages_to_install=["boto3", "requests"],  # Required for S3 and HTTP downloads
)
def import_pdfs(
    output_path: dsl.Output[dsl.Artifact],
    filenames: str,
    base_url: str,
    from_s3: bool = False,
    s3_secret_mount_path: str = "/mnt/secrets",
):
    """
    Import PDF files (comma-separated filenames) from the specified URL or S3 bucket.

    Args:
        filenames: Comma-separated PDF filenames to import.
        base_url: Base URL of the PDF files.
        output_path: Path to the output directory for the PDF files.
        from_s3: Whether or not to import from S3.
        s3_secret_mount_path: Mount path of the secret holding the S3 credentials.
    """
    import boto3  # pylint: disable=import-outside-toplevel
    import os  # pylint: disable=import-outside-toplevel
    from pathlib import Path  # pylint: disable=import-outside-toplevel
    import requests  # pylint: disable=import-outside-toplevel

    filenames_list = [name.strip() for name in filenames.split(",") if name.strip()]
    if not filenames_list:
        raise ValueError("filenames must contain at least one filename (comma-separated)")

    output_path_p = Path(output_path.path)
    output_path_p.mkdir(parents=True, exist_ok=True)

    if from_s3:
        if not os.path.exists(s3_secret_mount_path):
            raise ValueError(f"Secret for S3 should be mounted in {s3_secret_mount_path}")

        s3_endpoint_url_secret = "S3_ENDPOINT_URL"
        s3_endpoint_url_file_path = os.path.join(s3_secret_mount_path, s3_endpoint_url_secret)
        if os.path.isfile(s3_endpoint_url_file_path):
            with open(s3_endpoint_url_file_path) as f:
                s3_endpoint_url = f.read()
        else:
            raise ValueError(f"Key {s3_endpoint_url_secret} not defined in secret {s3_secret_mount_path}")

        s3_access_key_secret = "S3_ACCESS_KEY"
        s3_access_key_file_path = os.path.join(s3_secret_mount_path, s3_access_key_secret)
        if os.path.isfile(s3_access_key_file_path):
            with open(s3_access_key_file_path) as f:
                s3_access_key = f.read()
        else:
            raise ValueError(f"Key {s3_access_key_secret} not defined in secret {s3_secret_mount_path}")

        s3_secret_key_secret = "S3_SECRET_KEY"
        s3_secret_key_file_path = os.path.join(s3_secret_mount_path, s3_secret_key_secret)
        if os.path.isfile(s3_secret_key_file_path):
            with open(s3_secret_key_file_path) as f:
                s3_secret_key = f.read()
        else:
            raise ValueError(f"Key {s3_secret_key_secret} not defined in secret {s3_secret_mount_path}")

        s3_bucket_secret = "S3_BUCKET"
        s3_bucket_file_path = os.path.join(s3_secret_mount_path, s3_bucket_secret)
        if os.path.isfile(s3_bucket_file_path):
            with open(s3_bucket_file_path) as f:
                s3_bucket = f.read()
        else:
            raise ValueError(f"Key {s3_bucket_secret} not defined in secret {s3_secret_mount_path}")

        s3_prefix_secret = "S3_PREFIX"
        s3_prefix_file_path = os.path.join(s3_secret_mount_path, s3_prefix_secret)
        if os.path.isfile(s3_prefix_file_path):
            with open(s3_prefix_file_path) as f:
                s3_prefix = f.read()
        else:
            raise ValueError(f"Key {s3_prefix_secret} not defined in secret {s3_secret_mount_path}")

        if not s3_endpoint_url:
            raise ValueError("S3_ENDPOINT_URL must be provided")

        if not s3_bucket:
            raise ValueError("S3_BUCKET must be provided")

        s3_client = boto3.client(
            's3',
            endpoint_url=s3_endpoint_url,
            aws_access_key_id=s3_access_key,
            aws_secret_access_key=s3_secret_key,
        )

        for filename in filenames_list:
            orig = f"{s3_prefix.rstrip('/')}/{filename.lstrip('/')}"
            dest = output_path_p / filename
            print(f"import-test-pdfs: downloading {orig} -> {dest} from s3", flush=True)
            s3_client.download_file(s3_bucket, orig, dest)
    else:
        if not base_url:
            raise ValueError("base_url must be provided")

        for filename in filenames_list:
            url = f"{base_url.rstrip('/')}/{filename.lstrip('/')}"
            dest = output_path_p / filename
            print(f"import-test-pdfs: downloading {url} -> {dest}", flush=True)
            with requests.get(url, stream=True, timeout=30) as resp:
                resp.raise_for_status()
                with dest.open("wb") as f:
                    for chunk in resp.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)

    print("import-test-pdfs: done", flush=True)
```
```python
@dsl.component(
    base_image=PYTHON_BASE_IMAGE,
)
def create_pdf_splits(
    input_path: dsl.Input[dsl.Artifact],
    num_splits: int,
) -> List[List[str]]:
    """
    Create a list of PDF splits.

    Args:
        input_path: Path to the input directory containing PDF files.
        num_splits: Number of splits to create.
    """
    from pathlib import Path  # pylint: disable=import-outside-toplevel

    input_path_p = Path(input_path.path)

    all_pdfs = [path.name for path in input_path_p.glob("*.pdf")]
    all_splits = [all_pdfs[i::num_splits] for i in range(num_splits)]
    filled_splits = list(filter(None, all_splits))
    return filled_splits
```
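The stride-slicing idiom `all_pdfs[i::num_splits]` deals files out round-robin, and the `filter(None, ...)` drops empty splits when there are fewer files than splits. A quick illustration:

```python
# Round-robin split, as in create_pdf_splits: 5 files into 3 splits.
all_pdfs = ["a.pdf", "b.pdf", "c.pdf", "d.pdf", "e.pdf"]
num_splits = 3
all_splits = [all_pdfs[i::num_splits] for i in range(num_splits)]
# [['a.pdf', 'd.pdf'], ['b.pdf', 'e.pdf'], ['c.pdf']]

# With more splits than files, trailing splits are empty and get filtered out:
all_splits = [all_pdfs[i::7] for i in range(7)]
filled = list(filter(None, all_splits))
# len(filled) == 5
```

The last shared component downloads the Docling models for the selected pipeline type.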
```python
@dsl.component(
    base_image=DOCLING_BASE_IMAGE,
)
def download_docling_models(
    output_path: dsl.Output[dsl.Artifact],
    pipeline_type: str = "standard",
    remote_model_endpoint_enabled: bool = False,
):
    """
    Download Docling models based on pipeline type and configuration.

    This unified component handles model downloading for different pipeline types:
    - standard:   download traditional Docling models (layout, tableformer, easyocr)
    - vlm:        download Docling VLM models (smolvlm, smoldocling) for local inference
    - vlm-remote: download Docling VLM models for remote inference

    Args:
        output_path: Path to the output directory for Docling models.
        pipeline_type: Type of pipeline ("standard" or "vlm").
        remote_model_endpoint_enabled: Whether to download remote model endpoint models (VLM only).
    """
    from pathlib import Path  # pylint: disable=import-outside-toplevel
    from docling.utils.model_downloader import download_models  # pylint: disable=import-outside-toplevel

    output_path_p = Path(output_path.path)
    output_path_p.mkdir(parents=True, exist_ok=True)

    if pipeline_type == "standard":
        # Standard pipeline: download traditional models
        download_models(
            output_dir=output_path_p,
            progress=True,
            with_layout=True,
            with_tableformer=True,
            with_easyocr=False,
        )
    elif pipeline_type == "vlm" and remote_model_endpoint_enabled:
        # VLM pipeline with remote model endpoint: download minimal required models.
        # The model set mirrors what lives in the fabianofranz repo.
        # TODO: figure out what needs to be downloaded or removed
        download_models(
            output_dir=output_path_p,
            progress=False,
            force=False,
            with_layout=True,
            with_tableformer=True,
            with_code_formula=False,
            with_picture_classifier=False,
            with_smolvlm=False,
            with_smoldocling=False,
            with_smoldocling_mlx=False,
            with_granite_vision=False,
            with_easyocr=False,
        )
    elif pipeline_type == "vlm":
        # VLM pipeline with local models: download VLM models for local inference.
        # TODO: select models by a model name passed into the KFP pipeline, e.g. smoldocling OR granite-vision
        download_models(
            output_dir=output_path_p,
            with_smolvlm=True,
            with_smoldocling=True,
            progress=False,
            force=False,
            with_layout=False,
            with_tableformer=False,
            with_code_formula=False,
            with_picture_classifier=False,
            with_smoldocling_mlx=False,
            with_granite_vision=False,
            with_easyocr=False,
        )
    else:
        raise ValueError(f"Invalid pipeline_type: {pipeline_type}. Must be 'standard' or 'vlm'")
```
common/constants.py (new file, +8 lines):

```python
# Base container images used across all Docling Kubeflow Pipelines
PYTHON_BASE_IMAGE = "registry.access.redhat.com/ubi9/python-311:9.6-1755074620"
DOCLING_BASE_IMAGE = "quay.io/fabianofranz/docling-ubi9:2.54.0"

# Model types for download_docling_models component
MODEL_TYPE_STANDARD = "standard"
MODEL_TYPE_VLM = "vlm"
MODEL_TYPE_VLM_REMOTE = "vlm-remote"
```
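The MODEL_TYPE_* constants mirror the pipeline_type strings accepted by download_docling_models. They are usable at pipeline-composition time to avoid typo-prone literals; inside the component body itself the plain strings remain necessary, since KFP lightweight components execute in isolation from this module. A small sketch (pipeline name assumed):

```python
from kfp import dsl

from common import MODEL_TYPE_VLM, download_docling_models


@dsl.pipeline(name="docling-vlm")  # hypothetical name
def vlm_models_pipeline():
    # At composition time the constant is an ordinary Python string,
    # so it is safe to pass as a component argument.
    download_docling_models(pipeline_type=MODEL_TYPE_VLM)
```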
A review comment on the diff:

Handle empty/whitespace S3 prefixes before building the object key.

If `S3_PREFIX` is left blank (quite common when pulling from the bucket root), the current expression builds `orig = "/<filename>"`, which does not match the actual key and the download fails. Newlines from the secret files also bleed into the key and credentials because we never strip them. Please trim the secret contents and only prepend the prefix when it's non-empty.
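The committable suggestion attached to this comment did not survive the page scrape; a sketch of the kind of fix the reviewer describes (trim the secret contents, prepend the prefix only when non-empty):

```python
# Strip trailing newlines that tooling commonly writes into secret files.
s3_prefix = s3_prefix.strip()

for filename in filenames_list:
    key = filename.lstrip("/")
    # Only prepend the prefix when it is non-empty, so bucket-root
    # downloads do not produce keys like "/file.pdf".
    if s3_prefix:
        key = f"{s3_prefix.rstrip('/')}/{key}"
    dest = output_path_p / filename
    s3_client.download_file(s3_bucket, key, str(dest))
```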
Ruff (0.13.2) also flagged the same region of the file:

- Line 74: Possible hardcoded password assigned to: "s3_prefix_secret" (S105)
- Lines 80, 83, 86, 102: Avoid specifying long messages outside the exception class (TRY003)