PaddlePaddle
diff --git a/‎.github/workflows/metax_work.yaml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/metax_work.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/metax_work_private.yaml‎
Lines changed: 98 additions & 0 deletions b/‎.github/workflows/metax_work_private.yaml‎
Lines changed: 98 additions & 0 deletions
diff --git a/‎backends/metax_gpu/build_private_CI.sh‎
Lines changed: 91 additions & 0 deletions b/‎backends/metax_gpu/build_private_CI.sh‎
Lines changed: 91 additions & 0 deletions
diff --git a/‎backends/metax_gpu/kernels/cuda_kernels/uniform_kernel_register.cu‎
Lines changed: 5 additions & 2 deletions b/‎backends/metax_gpu/kernels/cuda_kernels/uniform_kernel_register.cu‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎backends/metax_gpu/patch/paddle.patch‎
Lines changed: 10 additions & 96 deletions b/‎backends/metax_gpu/patch/paddle.patch‎
Lines changed: 10 additions & 96 deletions
@@ -78,7 +78,7 @@ jobs:
 
         run: |
           cd backends/metax_gpu/tests
-          bash run_test.sh -j 16
+          bash run_test.sh -j 8
 
       - name: push whl
         env:
 
@@ -0,0 +1,98 @@
+name: paddle metax gpu private test
+
+on:
+  workflow_dispatch:
+  pull_request:
+    types: [opened, synchronize]
+    branches: [develop, release/**]
+  schedule:  # a trigger of its own under "on:", not an option of pull_request
+    - cron: "0 15 * * *"  # every day at 15:00 UTC
+permissions: read-all
+
+defaults:
+  run:
+    shell: bash
+
+jobs:
+  metax-gpu-test:
+    runs-on: paddle-metax-runner-set
+    # runs-on: debug-paddle-runner-set
+    steps:
+      # Clone the base branch; on pull requests, switch to the PR head and count
+      # the changed files to decide whether the full metax CI has to run.
+      - name: Checkout repository
+        run: |
+          git config --global user.name "GitHub Actions"
+          git config --global user.email "[email protected]"
+
+          # Shallow clone of the PR base branch (or of the current ref on non-PR events).
+          git clone \
+            --reference-if-able /home/runner/PaddleCustomDevice \
+            --depth=1 \
+            --shallow-submodules \
+            --jobs=8 \
+            --branch ${{ github.base_ref || github.ref_name}} \
+            --recurse-submodules \
+            https://${{ github.actor }}:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git .
+
+          if [ "${{ github.event_name }}" == "pull_request" ]; then
+            # Fetch the PR head into a local branch and check it out.
+            git fetch origin pull/${{ github.event.pull_request.number }}/head:pull/${{ github.event.pull_request.number }}/head
+            git checkout pull/${{ github.event.pull_request.number }}/head
+
+            paddle_branch=${{ github.base_ref || github.ref_name}}
+            echo $paddle_branch
+
+            # Count all changed files, those under backends/, and those under
+            # backends/metax_gpu. grep -c exits 1 when it counts zero matches, so
+            # "|| true" keeps a zero count from aborting the step under bash -e.
+            change_numbers=$(git diff --name-only remotes/origin/${paddle_branch} | wc -l)
+            echo $change_numbers
+
+            change_backend=$(git diff --name-only remotes/origin/${paddle_branch} | grep -c "backends/" || true)
+            echo $change_backend
+            change_metax_only=$(git diff --name-only remotes/origin/${paddle_branch} | grep -c "backends/metax_gpu" || true)
+            echo $change_metax_only
+
+            git diff --name-only remotes/origin/${paddle_branch}
+
+            # A change outside backends/ touches common code: always run the full CI.
+            if [ "$change_numbers" -ne "$change_backend" ]; then
+              echo "Common file changed, continue to run metax FULL CI test ..."
+            # Only backends/ files changed and none belongs to metax_gpu.
+            elif [ "$change_metax_only" -eq 0 ]; then
+              echo "NO metax backend changes found, skip metax FULL CI ....."
+              # NOTE(review): exit 0 ends only this step; later steps still run.
+              exit 0
+            fi
+
+            # git submodule update --init --recursive
+          fi
+
+
+      # Build the metax_gpu backend by running its private-CI build script.
+      - name: compile
+        run: |
+          cd backends/metax_gpu
+          bash build_private_CI.sh
+
+      # Run the backend test script (-j 8 is presumably its parallelism level; confirm in run_test.sh).
+      - name: run test
+        run: |
+          cd backends/metax_gpu/tests
+          bash run_test.sh -j 8
+
+      # Upload the built wheel with BosClient.py under <PR number>/<PR head sha>.
+      - name: push whl
+        env:
+          PR_ID: ${{ github.event.pull_request.number }}
+          COMMIT_ID: ${{ github.event.pull_request.head.sha }}
+        run: |
+          pip install bce-python-sdk==0.8.74
+          export AK=paddle SK=paddle
+          if [ ! -f "BosClient.py" ]; then
+            wget -q --no-proxy https://xly-devops.bj.bcebos.com/home/bos_retry.tar.gz --no-check-certificate
+            tar xf bos_retry.tar.gz
+          fi
+          cp backends/metax_gpu/build/dist/paddle_metax_gpu*.whl .
+          python BosClient.py paddle_metax_gpu*.whl paddle-github-action/PaddleCustomDevice/metax_gpu/${PR_ID}/${COMMIT_ID}
@@ -0,0 +1,91 @@
+#!/bin/bash
+# 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved.
+
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+# uninstall paddle
+pip  uninstall paddlepaddle -y
+
+
+# Section: reset the sibling Paddle checkout to the latest origin/develop.
+
+# update_paddle_dev.sh
+# Give the current user ownership of the Paddle and PaddleCustomDevice trees.
+chown -R $USER:$USER ../../Paddle/
+chown -R $USER:$USER ../../../PaddleCustomDevice/
+# Step 1: discard all local modifications (tracked files only; new files are kept)
+cd ../../Paddle/
+echo "🔄 正在撤销所有本地修改（git checkout .）..."
+git checkout develop
+git checkout .
+
+# Step 2: pull the latest code of the remote dev (normally develop) branch
+echo "🌐 正在拉取远程最新的 dev (develop) 分支代码..."
+
+
+# Pull the latest develop code (sync with the remote)
+git pull origin develop
+
+echo "🔗 当前分支: $(git branch --show-current)"
+echo "📌 最新 commit hash (短): $(git rev-parse --short HEAD)"
+echo "📌 最新 commit 信息:"
+git log -1 --oneline
+
+# Report completion
+echo "✅ 脚本执行完毕！"
+echo "📌 已撤销本地修改，并更新到 Paddle 最新的 develop (dev) 分支代码。"
+
+
+# Extra Python dependencies, fetched from the TUNA PyPI mirror.
+pip install parameterized safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+
+# Install the nightly CPU build of paddlepaddle.
+python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
+
+# unset http_proxy https_proxy
+# Return to the directory we were in before "cd ../../Paddle/", then apply the patch.
+cd -
+bash change_patch.sh
+
+# Toolchain locations: MACA SDK, CUDA 11.7 tree and the cu-bridge tools.
+export MACA_PATH=/opt/maca
+export CUDA_PATH=/workspace/cuda-11.7/
+export CUCC_PATH=${MACA_PATH}/tools/cu-bridge
+# MACA and CUDA binaries go in front of the existing PATH, cu-bridge after it.
+export PATH=${MACA_PATH}/bin:${CUDA_PATH}/bin:${PATH}:${CUCC_PATH}/tools:${CUCC_PATH}/bin
+export LD_LIBRARY_PATH=${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${LD_LIBRARY_PATH}
+export PADDLE_VERSION=3.3.0
+# Create the build directory on first use.
+if ! [ -d build ]; then
+    echo "build directory not found, creating..."
+    mkdir build
+fi
+# Configure and compile with cmake_maca / make_maca.
+echo "make_maca"
+cd build
+cmake_maca .. -DCMAKE_BUILD_TYPE=Release -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON
+make_maca -j60
+# Install the freshly built wheel, then step back out of build/.
+echo "install whl"
+pip install dist/paddle_metax_gpu*.whl --force-reinstall
+cd ..
+echo "Done!"
+# Inspect the OSS destination, then upload the wheel to test1/.
+cd build/dist/
+ossutil ls oss://opensource-ci/paddle/
+ossutil cat oss://opensource-ci/paddle/test1
+ossutil cp ./paddle_metax_gpu-*.whl oss://opensource-ci/paddle/test1/
+cd -
@@ -21,5 +21,8 @@ PD_CUSTOM_KERNEL_REGISTER(uniform,
                           phi::UniformKernel,
                           float,
                           double,
-                          phi::dtype::float16,
-                          phi::dtype::bfloat16) {}
+                          phi::float16,
+                          phi::bfloat16,
+                          phi::float8_e4m3fn,
+                          phi::complex64,
+                          phi::complex128) {}
@@ -48,7 +48,7 @@ index bff0f2bf70..9376b5781f 100644
  #include "paddle/phi/core/platform/device/gpu/gpu_info.h"
  #include "paddle/phi/core/platform/profiler/utils.h"
 diff --git a/paddle/phi/backends/dynload/cublas.h b/paddle/phi/backends/dynload/cublas.h
-index 62beb53cfe..0b0ac09fc0 100644
+index bda9cbe17e..c73eba9c8a 100644
 --- a/paddle/phi/backends/dynload/cublas.h
 +++ b/paddle/phi/backends/dynload/cublas.h
@@ -49,7 +49,12 @@ extern void *cublas_dso_handle;
@@ -98,107 +98,21 @@ index 8b2e08c777..ca926df151 100644
  #define CUBLASLT_BLAS_ROUTINE_EACH(__macro)      \
    __macro(cublasLtCreate);                       \
 diff --git a/paddle/phi/backends/dynload/cudnn.h b/paddle/phi/backends/dynload/cudnn.h
-index c0080f0a5e..458ca3e2e8 100644
+index a943bbed9a..af931490e3 100644
 --- a/paddle/phi/backends/dynload/cudnn.h
 +++ b/paddle/phi/backends/dynload/cudnn.h
-@@ -38,7 +38,9 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
+@@ -38,7 +38,10 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
          cudnn_dso_handle = phi::dynload::GetCUDNNDsoHandle();        \
        });                                                            \
        EnforceCUDNNLoaded(#__name);                                   \
 -      static void* p_##__name = dlsym(cudnn_dso_handle, #__name);    \
 +      std::string replaced_name = #__name;                                  \
-+      replaced_name =  replaced_name.replace(0,2,"mc");          \
-+      static void* p_##__name = dlsym(cudnn_dso_handle, replaced_name.c_str());    \
++      replaced_name = replaced_name.replace(0, 2, "mc");                    \
++      static void* p_##__name =                                             \
++          dlsym(cudnn_dso_handle, replaced_name.c_str());                \
        return reinterpret_cast<cudnn_func>(p_##__name)(args...);      \
      }                                                                \
    };                                                                 \
-@@ -49,7 +51,6 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
-  * different cudnn version has different interfaces
-  **/
- #define CUDNN_DNN_ROUTINE_EACH(__macro)                    \
--  __macro(cudnnSetCallback);                               \
-   __macro(cudnnSetTensor4dDescriptor);                     \
-   __macro(cudnnSetTensor4dDescriptorEx);                   \
-   __macro(cudnnSetTensorNdDescriptor);                     \
-@@ -104,6 +105,13 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
-   __macro(cudnnSetDropoutDescriptor);                      \
-   __macro(cudnnRestoreDropoutDescriptor);                  \
-   __macro(cudnnCreateRNNDescriptor);                       \
-+  __macro(cudnnGetRNNParamsSize);                          \
-+  __macro(cudnnGetRNNWorkspaceSize);                       \
-+  __macro(cudnnGetRNNTrainingReserveSize);                 \
-+  __macro(cudnnRNNForwardTraining);                        \
-+  __macro(cudnnRNNBackwardData);                           \
-+  __macro(cudnnRNNBackwardWeights);                        \
-+  __macro(cudnnRNNForwardInference);                       \
-   __macro(cudnnDestroyDropoutDescriptor);                  \
-   __macro(cudnnDestroyRNNDescriptor);                      \
-   __macro(cudnnSetTensorNdDescriptorEx);                   \
-@@ -118,7 +126,8 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
-   __macro(cudnnCreateActivationDescriptor);                \
-   __macro(cudnnSetActivationDescriptor);                   \
-   __macro(cudnnGetActivationDescriptor);                   \
--  __macro(cudnnDestroyActivationDescriptor);
-+  __macro(cudnnDestroyActivationDescriptor);               \
-+  __macro(cudnnSetRNNDescriptor_v6);
- CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
- 
- #if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000
-@@ -152,7 +161,12 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
- #define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(__macro) \
-   __macro(cudnnCreateRNNDataDescriptor);             \
-   __macro(cudnnDestroyRNNDataDescriptor);            \
--  __macro(cudnnSetRNNDataDescriptor);
-+  __macro(cudnnSetRNNDataDescriptor);                \
-+  __macro(cudnnSetRNNPaddingMode);                   \
-+  __macro(cudnnRNNForwardTrainingEx);                \
-+  __macro(cudnnRNNBackwardDataEx);                   \
-+  __macro(cudnnRNNBackwardWeightsEx);                \
-+  __macro(cudnnRNNForwardInferenceEx);
- CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
- #endif
- 
-@@ -195,40 +209,6 @@ CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
- CUDNN_DNN_ROUTINE_EACH_FRONTEND(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
- #endif
- 
--#if CUDNN_VERSION < 90000
--#define CUDNN_DNN_ROUTINE_EACH_REMOVED_IN_E9(__macro) \
--  __macro(cudnnGetRNNParamsSize);                     \
--  __macro(cudnnGetRNNWorkspaceSize);                  \
--  __macro(cudnnGetRNNTrainingReserveSize);            \
--  __macro(cudnnSetRNNDescriptor_v6);                  \
--  __macro(cudnnRNNForwardInference);                  \
--  __macro(cudnnRNNForwardTraining);                   \
--  __macro(cudnnRNNBackwardData);                      \
--  __macro(cudnnRNNBackwardWeights);
--CUDNN_DNN_ROUTINE_EACH_REMOVED_IN_E9(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
--#endif
--
--#if CUDNN_VERSION < 90000 && CUDNN_VERSION >= 7201
--#define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7_REMOVED_IN_E9(__macro) \
--  __macro(cudnnSetRNNPaddingMode);                                 \
--  __macro(cudnnRNNForwardInferenceEx);                             \
--  __macro(cudnnRNNForwardTrainingEx);                              \
--  __macro(cudnnRNNBackwardDataEx);                                 \
--  __macro(cudnnRNNBackwardWeightsEx);
--CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7_REMOVED_IN_E9(
--    DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
--#endif
--
--#if CUDNN_VERSION >= 90000
--#define CUDNN_DNN_ROUTINE_EACH_R9(__macro) \
--  __macro(cudnnGetLastErrorString);        \
--  __macro(cudnnGetRNNWeightSpaceSize);     \
--  __macro(cudnnGetRNNTempSpaceSizes);      \
--  __macro(cudnnRNNForward);                \
--  __macro(cudnnRNNBackwardData_v8);        \
--  __macro(cudnnRNNBackwardWeights_v8);
--CUDNN_DNN_ROUTINE_EACH_R9(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
--#endif
- }  // namespace dynload
- }  // namespace phi
- 
 diff --git a/paddle/phi/backends/dynload/cufft.h b/paddle/phi/backends/dynload/cufft.h
 index 1547909d92..ef20838434 100644
 --- a/paddle/phi/backends/dynload/cufft.h
@@ -247,7 +161,7 @@ index 59e92955c9..d2f8c2da15 100644
 +#endif  // PADDLE_WITH_CUPTI
 \ No newline at end of file
 diff --git a/paddle/phi/backends/dynload/cusolver.h b/paddle/phi/backends/dynload/cusolver.h
-index 86651fc8f1..7c9b122a17 100644
+index 57e09bb6e4..87fb5b1797 100644
 --- a/paddle/phi/backends/dynload/cusolver.h
 +++ b/paddle/phi/backends/dynload/cusolver.h
@@ -34,7 +34,9 @@ extern void *cusolver_dso_handle;
@@ -262,7 +176,7 @@ index 86651fc8f1..7c9b122a17 100644
      }                                                                \
    };                                                                 \
 diff --git a/paddle/phi/backends/dynload/cusparse.h b/paddle/phi/backends/dynload/cusparse.h
-index 8ec3cf2792..6f5460df00 100644
+index e8cb0ac643..e8e7596d44 100644
 --- a/paddle/phi/backends/dynload/cusparse.h
 +++ b/paddle/phi/backends/dynload/cusparse.h
@@ -34,7 +34,9 @@ extern void *cusparse_dso_handle;
@@ -277,7 +191,7 @@ index 8ec3cf2792..6f5460df00 100644
      }                                                                \
    };                                                                 \
 diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc
-index 859f696896..87b5100a1b 100644
+index c74ae9592e..f6dc68917c 100644
 --- a/paddle/phi/backends/dynload/dynamic_loader.cc
 +++ b/paddle/phi/backends/dynload/dynamic_loader.cc
@@ -18,7 +18,6 @@ limitations under the License. */
@@ -755,7 +669,7 @@ index 4eae698648..5c047723ea 100644
    return block_dim >= kMaxBlockDim ? kMaxBlockDim : lwarpSize;
  }
 diff --git a/paddle/phi/kernels/funcs/math_cuda_utils.h b/paddle/phi/kernels/funcs/math_cuda_utils.h
-index e5361b836e..5ad238df08 100644
+index dff1033db4..0098123818 100644
 --- a/paddle/phi/kernels/funcs/math_cuda_utils.h
 +++ b/paddle/phi/kernels/funcs/math_cuda_utils.h
@@ -175,12 +175,12 @@ struct KeyValuePair<half> {