Skip to content

Commit af29cd9

Browse files
authored
Merge branch 'main' into linker-backend-classmethod-714
2 parents 12d291b + 371fa42 commit af29cd9

137 files changed

Lines changed: 5263 additions & 2141 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

‎.gitattributes‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
*.h binary
66
*.hpp binary
77
# Exception: headers we own
8+
benchmarks/cuda_bindings/benchmarks/cpp/*.hpp -binary text diff
89
cuda_bindings/cuda/bindings/_bindings/*.h -binary text diff
910
cuda_bindings/cuda/bindings/_lib/*.h -binary text diff
1011
cuda_core/cuda/core/_cpp/*.h -binary text diff

‎.github/workflows/ci.yml‎

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -343,7 +343,7 @@ jobs:
343343
build-type: pull-request
344344
host-platform: ${{ matrix.host-platform }}
345345
build-ctk-ver: ${{ needs.ci-vars.outputs.CUDA_BUILD_VER }}
346-
nruns: ${{ (github.event_name == 'schedule' && 100) || 1}}
346+
nruns: ${{ (github.event_name == 'schedule' && 5) || 1}}
347347
skip-bindings-test: ${{ !fromJSON(needs.detect-changes.outputs.test_bindings) }}
348348

349349
# See test-linux-64 for why test jobs are split by platform.
@@ -368,7 +368,7 @@ jobs:
368368
build-type: pull-request
369369
host-platform: ${{ matrix.host-platform }}
370370
build-ctk-ver: ${{ needs.ci-vars.outputs.CUDA_BUILD_VER }}
371-
nruns: ${{ (github.event_name == 'schedule' && 100) || 1}}
371+
nruns: ${{ (github.event_name == 'schedule' && 5) || 1}}
372372
skip-bindings-test: ${{ !fromJSON(needs.detect-changes.outputs.test_bindings) }}
373373

374374
# See test-linux-64 for why test jobs are split by platform.
@@ -393,7 +393,7 @@ jobs:
393393
build-type: pull-request
394394
host-platform: ${{ matrix.host-platform }}
395395
build-ctk-ver: ${{ needs.ci-vars.outputs.CUDA_BUILD_VER }}
396-
nruns: ${{ (github.event_name == 'schedule' && 100) || 1}}
396+
nruns: ${{ (github.event_name == 'schedule' && 5) || 1}}
397397
skip-bindings-test: ${{ !fromJSON(needs.detect-changes.outputs.test_bindings) }}
398398

399399
doc:

‎.github/workflows/pr-auto-label.yml‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,4 @@ jobs:
2020
pull-requests: write
2121
steps:
2222
- name: Apply labels
23-
uses: actions/labeler@634933edcd8ababfe52f92936142cc22ac488b1b # v6.0.1
23+
uses: actions/labeler@e52e4fb63ed5cd0e07abaad9826b2a893ccb921f # main (include actions/labeler#917)

‎.github/workflows/release.yml‎

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ on:
2020
- cuda-bindings
2121
- cuda-pathfinder
2222
- cuda-python
23-
- all
2423
git-tag:
2524
description: "The release git tag"
2625
required: true
@@ -89,6 +88,30 @@ jobs:
8988
gh release create "${{ inputs.git-tag }}" --draft --repo "${{ github.repository }}" --title "Release ${{ inputs.git-tag }}" --notes "Release ${{ inputs.git-tag }}"
9089
fi
9190
91+
check-release-notes:
92+
runs-on: ubuntu-latest
93+
steps:
94+
- name: Checkout Source
95+
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
96+
with:
97+
ref: ${{ inputs.git-tag }}
98+
99+
- name: Set up Python
100+
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
101+
with:
102+
python-version: "3.12"
103+
104+
- name: Self-test release-notes checker
105+
run: |
106+
pip install pytest
107+
pytest ci/tools/tests
108+
109+
- name: Check versioned release notes exist
110+
run: |
111+
python ci/tools/check_release_notes.py \
112+
--git-tag "${{ inputs.git-tag }}" \
113+
--component "${{ inputs.component }}"
114+
92115
doc:
93116
name: Build release docs
94117
if: ${{ github.repository_owner == 'nvidia' }}
@@ -99,6 +122,7 @@ jobs:
99122
pull-requests: write
100123
needs:
101124
- check-tag
125+
- check-release-notes
102126
- determine-run-id
103127
secrets: inherit
104128
uses: ./.github/workflows/build-docs.yml
@@ -114,6 +138,7 @@ jobs:
114138
contents: write
115139
needs:
116140
- check-tag
141+
- check-release-notes
117142
- determine-run-id
118143
- doc
119144
secrets: inherit
@@ -128,11 +153,12 @@ jobs:
128153
runs-on: ubuntu-latest
129154
needs:
130155
- check-tag
156+
- check-release-notes
131157
- determine-run-id
132158
- doc
133159
environment:
134160
name: testpypi
135-
url: https://test.pypi.org/${{ inputs.component != 'all' && format('p/{0}/', inputs.component) || '' }}
161+
url: https://test.pypi.org/p/${{ inputs.component }}/
136162
permissions:
137163
id-token: write
138164
steps:
@@ -162,7 +188,7 @@ jobs:
162188
- publish-testpypi
163189
environment:
164190
name: pypi
165-
url: https://pypi.org/${{ inputs.component != 'all' && format('p/{0}/', inputs.component) || '' }}
191+
url: https://pypi.org/p/${{ inputs.component }}/
166192
permissions:
167193
id-token: write
168194
steps:

‎.github/workflows/test-wheel-linux.yml‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -275,7 +275,7 @@ jobs:
275275
run: |
276276
pip install pyperf
277277
pushd benchmarks/cuda_bindings
278-
python run_pyperf.py --fast --min-time 1
278+
python run_pyperf.py --debug-single-value
279279
popd
280280
281281
- name: Run cuda.core tests

‎.gitignore‎

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,6 @@ cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd
2626
cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx
2727
cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd
2828
cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx
29-
cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd
30-
cuda_bindings/cuda/bindings/_bindings/cynvrtc.pyx
3129
cuda_bindings/cuda/bindings/_internal/_nvml.pyx
3230
cuda_bindings/cuda/bindings/_internal/cufile.pyx
3331
cuda_bindings/cuda/bindings/_internal/nvfatbin.pyx
@@ -42,14 +40,10 @@ cuda_bindings/cuda/bindings/cyruntime.pxd
4240
cuda_bindings/cuda/bindings/cyruntime.pyx
4341
cuda_bindings/cuda/bindings/cyruntime_functions.pxi
4442
cuda_bindings/cuda/bindings/cyruntime_types.pxi
45-
cuda_bindings/cuda/bindings/cynvrtc.pxd
46-
cuda_bindings/cuda/bindings/cynvrtc.pyx
4743
cuda_bindings/cuda/bindings/driver.pxd
4844
cuda_bindings/cuda/bindings/driver.pyx
4945
cuda_bindings/cuda/bindings/runtime.pxd
5046
cuda_bindings/cuda/bindings/runtime.pyx
51-
cuda_bindings/cuda/bindings/nvrtc.pxd
52-
cuda_bindings/cuda/bindings/nvrtc.pyx
5347
cuda_bindings/cuda/bindings/utils/_get_handle.pyx
5448

5549
# Version files from setuptools_scm

‎.spdx-ignore‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,7 @@ cuda_bindings/examples/*
1010

1111
# Vendored
1212
cuda_core/cuda/core/_include/dlpack.h
13+
cuda_core/cuda/core/_include/aoti_shim.h
14+
cuda_core/cuda/core/_include/aoti_shim.def
1315

1416
qa/ctk-next.drawio.svg

‎benchmarks/cuda_bindings/README.md‎

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,13 @@ Driver APIs through cuda.bindings, relative to a similar C++ baseline.
66
The goal is to benchmark how much overhead the Python layer adds to calling
77
CUDA APIs and which operations fall outside our target of less than 1us of overhead.
88

9-
Each Python benchmark has a C++ counterpart, which is used to compare the
10-
operations. We try to make each implementation perform small operations
11-
and nearly the same work as possible and are run under similar conditions.
9+
Most Python benchmarks have a C++ counterpart that is used as a comparative
10+
baseline. We try to make each implementation perform small operations and
11+
as close to the same work as possible, and we run them under similar conditions.
12+
13+
A few benchmarks (e.g. in `bench_enum.py`) are intentionally Python-only
14+
because they measure costs with no direct C++ equivalent — such as enum
15+
construction and member access on `cuda.bindings` enum classes.
1216

1317
These are **not** throughput benchmarks to measure the overall performance
1418
of kernels and applications.
@@ -47,12 +51,14 @@ To run the benchmarks combine the environment and task:
4751
```bash
4852
# Run the Python benchmarks in the wheel environment
4953
pixi run -e wheel bench
54+
pixi run -e wheel bench --min-time 0.1
5055

5156
# Run the Python benchmarks in the source environment
5257
pixi run -e source bench
5358

5459
# Run the C++ benchmarks
5560
pixi run -e wheel bench-cpp
61+
pixi run -e wheel bench-cpp --min-time 0.1
5662
```
5763

5864
Both runners automatically save results to JSON files in the benchmarks

‎benchmarks/cuda_bindings/benchmarks/bench_ctx_device.py‎

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@
1313
_, DEVICE = cuda.cuDeviceGet(0)
1414
ATTRIBUTE = cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR
1515

16+
# Outer retain so the benchmarked retain/release pair just bumps the refcount.
17+
_err, _PRIMARY_CTX = cuda.cuDevicePrimaryCtxRetain(DEVICE)
18+
if _err != cuda.CUresult.CUDA_SUCCESS:
19+
raise RuntimeError(f"cuDevicePrimaryCtxRetain failed during setup: {_err}")
20+
1621

1722
def bench_ctx_get_current(loops: int) -> float:
1823
_fn = cuda.cuCtxGetCurrent
@@ -60,3 +65,15 @@ def bench_device_get_attribute(loops: int) -> float:
6065
for _ in range(loops):
6166
_fn(_attr, _dev)
6267
return time.perf_counter() - t0
68+
69+
70+
def bench_device_primary_ctx_retain(loops: int) -> float:
71+
_retain = cuda.cuDevicePrimaryCtxRetain
72+
_release = cuda.cuDevicePrimaryCtxRelease
73+
_dev = DEVICE
74+
75+
t0 = time.perf_counter()
76+
for _ in range(loops):
77+
_retain(_dev)
78+
_release(_dev)
79+
return time.perf_counter() - t0
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
import time
6+
7+
from cuda.bindings import driver as cuda
8+
9+
10+
def bench_curesult_construction(loops: int) -> float:
11+
_cls = cuda.CUresult
12+
13+
t0 = time.perf_counter()
14+
for _ in range(loops):
15+
_cls(0)
16+
return time.perf_counter() - t0
17+
18+
19+
def bench_curesult_member_access(loops: int) -> float:
20+
_cls = cuda.CUresult
21+
22+
t0 = time.perf_counter()
23+
for _ in range(loops):
24+
_cls.CUDA_SUCCESS # noqa: B018
25+
return time.perf_counter() - t0
26+
27+
28+
def bench_device_attribute_construction(loops: int) -> float:
29+
_cls = cuda.CUdevice_attribute
30+
31+
t0 = time.perf_counter()
32+
for _ in range(loops):
33+
_cls(1)
34+
return time.perf_counter() - t0

0 commit comments

Comments
 (0)