-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Description
Hi, I tried to test Llama 2 with TensorRT-LLM.
My environment (based on the "nvcr.io-nvidia-tritonserver-23.10-trtllm-python-py3" image):
cuda 12.2
gpu A100 40G (1)
python 3.10.12
ubuntu 22.04.3
tensorrt 9.2.0.post12.dev5
tensorrt-llm 0.7.0
triton 2.1.0
I tried to use Llama 2 by following examples/llama/README.
The build succeeded when I ran:
$ python build.py --model_dir meta-llama/llama-2-7b-hf\ --dtype float16 \ --remove_input_padding \ --use_gpt_attention_plugin float16 \ --enable_context_fmha \ --use_gemm_plugin float16 \ --output_dir ./tmp/llama/out/
When I tried to run the engine after the build:
$ python3 ../run.py --max_output_len=50 \ --tokenizer_dir meta-llama/llama-2-7b-hf \ --engine_dir=./tmp/llama/out/
I got this error message:
[Instance-1891:140515:0:140515] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x54000009)
==== backtrace (tid: 140515) ====
0 0x0000000000042520 __sigaction() ???:0
1 0x000000000006ce20 PMPI_Comm_set_errhandler() /build-result/src/hpcx-v2.16-gcc-inbox-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64/ompi-db10576f403e833fdf7cd0d938e66b8393b20680/ompi/mpi/c/profile/pcomm_set_errhandler.c:81
2 0x000000000006ce20 opal_atomic_add_fetch_32() /build-result/src/hpcx-v2.16-gcc-inbox-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64/ompi-db10576f403e833fdf7cd0d938e66b8393b20680/ompi/mpi/c/profile/../../../../opal/include/opal/sys/atomic_impl.h:384
3 0x000000000006ce20 opal_thread_add_fetch_32() /build-result/src/hpcx-v2.16-gcc-inbox-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64/ompi-db10576f403e833fdf7cd0d938e66b8393b20680/ompi/mpi/c/profile/../../../../opal/threads/thread_usage.h:152
4 0x000000000006ce20 opal_obj_update() /build-result/src/hpcx-v2.16-gcc-inbox-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64/ompi-db10576f403e833fdf7cd0d938e66b8393b20680/ompi/mpi/c/profile/../../../../opal/class/opal_object.h:534
5 0x000000000006ce20 PMPI_Comm_set_errhandler() /build-result/src/hpcx-v2.16-gcc-inbox-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64/ompi-db10576f403e833fdf7cd0d938e66b8393b20680/ompi/mpi/c/profile/pcomm_set_errhandler.c:70
6 0x00000000000a728f __pyx_f_6mpi4py_3MPI_comm_set_eh() /tmp/pip-install-8x8e5fta/mpi4py_ff745ee2b9414fd99054a30ef67df184/src/mpi4py.MPI.c:40330
7 0x00000000000a728f __pyx_f_6mpi4py_3MPI_initialize() /tmp/pip-install-8x8e5fta/mpi4py_ff745ee2b9414fd99054a30ef67df184/src/mpi4py.MPI.c:8406
8 0x0000000000047e7c __pyx_f_6mpi4py_3MPI_initialize() /tmp/pip-install-8x8e5fta/mpi4py_ff745ee2b9414fd99054a30ef67df184/src/mpi4py.MPI.c:8394
9 0x000000000023b2d3 PyModule_ExecDef() ???:0
10 0x000000000023bda0 PyInit__thread() ???:0
11 0x000000000015f854 PyObject_GenericGetAttr() ???:0
12 0x000000000014b2c1 _PyEval_EvalFrameDefault() ???:0
13 0x000000000016070c _PyFunction_Vectorcall() ???:0
14 0x000000000014e8a2 _PyEval_EvalFrameDefault() ???:0
15 0x000000000016070c _PyFunction_Vectorcall() ???:0
16 0x0000000000148f52 _PyEval_EvalFrameDefault() ???:0
17 0x000000000016070c _PyFunction_Vectorcall() ???:0
18 0x0000000000148e0d _PyEval_EvalFrameDefault() ???:0
19 0x000000000016070c _PyFunction_Vectorcall() ???:0
20 0x0000000000148e0d _PyEval_EvalFrameDefault() ???:0
21 0x000000000016070c _PyFunction_Vectorcall() ???:0
22 0x000000000015fb24 PyObject_CallFunctionObjArgs() ???:0
23 0x000000000023f4af _PyObject_CallMethodIdObjArgs() ???:0
24 0x00000000001740ca PyImport_ImportModuleLevelObject() ???:0
25 0x0000000000184458 PyImport_Import() ???:0
26 0x000000000015fe0e PyObject_CallFunctionObjArgs() ???:0
27 0x000000000016f12b PyObject_Call() ???:0
28 0x000000000014b2c1 _PyEval_EvalFrameDefault() ???:0
29 0x000000000016070c _PyFunction_Vectorcall() ???:0
30 0x0000000000148e0d _PyEval_EvalFrameDefault() ???:0
31 0x000000000016070c _PyFunction_Vectorcall() ???:0
32 0x000000000015fb24 PyObject_CallFunctionObjArgs() ???:0
33 0x000000000023f4af _PyObject_CallMethodIdObjArgs() ???:0
34 0x0000000000174cda PyImport_ImportModuleLevelObject() ???:0
35 0x0000000000151216 _PyEval_EvalFrameDefault() ???:0
36 0x000000000016070c _PyFunction_Vectorcall() ???:0
37 0x0000000000148e0d _PyEval_EvalFrameDefault() ???:0
38 0x000000000016070c _PyFunction_Vectorcall() ???:0
39 0x000000000014e8a2 _PyEval_EvalFrameDefault() ???:0
40 0x000000000016070c _PyFunction_Vectorcall() ???:0
41 0x0000000000148e0d _PyEval_EvalFrameDefault() ???:0
42 0x0000000000239e56 PyEval_EvalCode() ???:0
43 0x0000000000239cf6 PyEval_EvalCode() ???:0
44 0x00000000002647d8 PyUnicode_Tailmatch() ???:0
45 0x000000000025e0bb PyInit__collections() ???:0
46 0x0000000000264525 PyUnicode_Tailmatch() ???:0
47 0x0000000000263a08 _PyRun_SimpleFileObject() ???:0
48 0x0000000000263653 _PyRun_AnyFileObject() ???:0
49 0x000000000025641e Py_RunMain() ???:0
50 0x000000000022ccad Py_BytesMain() ???:0
51 0x0000000000029d90 __libc_init_first() ???:0
52 0x0000000000029e40 __libc_start_main() ???:0
53 0x000000000022cba5 _start() ???:0
python3:140515 terminated with signal 11 at PC=7f887654de20 SP=7ffdd00247f0. Backtrace:
/usr/local/mpi/lib/libmpi.so.40(PMPI_Comm_set_errhandler+0xb0)[0x7f887654de20]
/usr/local/lib/python3.10/dist-packages/mpi4py/MPI.cpython-310-x86_64-linux-gnu.so(+0xa728f)[0x7f8875d4728f]
/usr/local/lib/python3.10/dist-packages/mpi4py/MPI.cpython-310-x86_64-linux-gnu.so(+0x47e7c)[0x7f8875ce7e7c]
python3(PyModule_ExecDef+0x73)[0x55704666c2d3]
python3(+0x23bda0)[0x55704666cda0]
python3(+0x15f854)[0x557046590854]
python3(_PyEval_EvalFrameDefault+0x2b71)[0x55704657c2c1]
python3(_PyFunction_Vectorcall+0x7c)[0x55704659170c]
python3(_PyEval_EvalFrameDefault+0x6152)[0x55704657f8a2]
python3(_PyFunction_Vectorcall+0x7c)[0x55704659170c]
python3(_PyEval_EvalFrameDefault+0x802)[0x557046579f52]
python3(_PyFunction_Vectorcall+0x7c)[0x55704659170c]
python3(_PyEval_EvalFrameDefault+0x6bd)[0x557046579e0d]
python3(_PyFunction_Vectorcall+0x7c)[0x55704659170c]
python3(_PyEval_EvalFrameDefault+0x6bd)[0x557046579e0d]
python3(_PyFunction_Vectorcall+0x7c)[0x55704659170c]
python3(+0x15fb24)[0x557046590b24]
python3(_PyObject_CallMethodIdObjArgs+0xff)[0x5570466704af]
python3(PyImport_ImportModuleLevelObject+0x25a)[0x5570465a50ca]
python3(+0x184458)[0x5570465b5458]
python3(+0x15fe0e)[0x557046590e0e]
python3(PyObject_Call+0xbb)[0x5570465a012b]
python3(_PyEval_EvalFrameDefault+0x2b71)[0x55704657c2c1]
python3(_PyFunction_Vectorcall+0x7c)[0x55704659170c]
python3(_PyEval_EvalFrameDefault+0x6bd)[0x557046579e0d]
python3(_PyFunction_Vectorcall+0x7c)[0x55704659170c]
python3(+0x15fb24)[0x557046590b24]
python3(_PyObject_CallMethodIdObjArgs+0xff)[0x5570466704af]
python3(PyImport_ImportModuleLevelObject+0xe6a)[0x5570465a5cda]
python3(_PyEval_EvalFrameDefault+0x8ac6)[0x557046582216]
python3(_PyFunction_Vectorcall+0x7c)[0x55704659170c]
python3(_PyEval_EvalFrameDefault+0x6bd)[0x557046579e0d]
python3(_PyFunction_Vectorcall+0x7c)[0x55704659170c]
python3(_PyEval_EvalFrameDefault+0x6152)[0x55704657f8a2]
python3(_PyFunction_Vectorcall+0x7c)[0x55704659170c]
python3(_PyEval_EvalFrameDefault+0x6bd)[0x557046579e0d]
python3(+0x239e56)[0x55704666ae56]
python3(PyEval_EvalCode+0x86)[0x55704666acf6]
python3(+0x2647d8)[0x5570466957d8]
python3(+0x25e0bb)[0x55704668f0bb]
python3(+0x264525)[0x557046695525]
python3(_PyRun_SimpleFileObject+0x1a8)[0x557046694a08]
python3(_PyRun_AnyFileObject+0x43)[0x557046694653]
python3(Py_RunMain+0x2be)[0x55704668741e]
python3(Py_BytesMain+0x2d)[0x55704665dcad]
/lib/x86_64-linux-gnu/libc.so.6(+0x29d90)[0x7f8aa41e1d90]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0x80)[0x7f8aa41e1e40]
python3(_start+0x25)[0x55704665dba5]
When I tested the GPT-NeoX examples (using Polyglot models), I got the same issue.
In both cases build.py works, but run.py fails with the same error message.
I suspect the cause is MPI, because the backtrace shows the crash inside PMPI_Comm_set_errhandler, called from mpi4py's MPI initialization.
Can anybody help me solve this problem?