diff --git a/nlp/llm/llama3_8b/openrlhf/README.md b/nlp/llm/llama3_8b/openrlhf/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e65b8c46b9be72b5e91a0d4e9bb72694845bf183
--- /dev/null
+++ b/nlp/llm/llama3_8b/openrlhf/README.md
@@ -0,0 +1,67 @@
+# Llama3-8B (OpenRLHF)
+
+## Model Description
+
+Llama3-8B is an advanced auto-regressive language model developed by Meta, featuring 8 billion parameters. It uses an
+optimized transformer architecture with Grouped-Query Attention (GQA) for improved inference efficiency. Trained on
+sequences of 8,192 tokens with a 128K-token vocabulary, it excels at a wide range of natural language tasks. The model
+incorporates supervised fine-tuning (SFT) and reinforcement learning from human feedback (RLHF) to align with human
+preferences, ensuring both helpfulness and safety in its responses. Llama3-8B offers state-of-the-art performance in
+language understanding and generation.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| BI-V150 | 4.2.0 | 25.06 |
+
+## Model Preparation
+
+### Prepare Resources
+
+```sh
+git clone https://github.com/OpenRLHF/OpenRLHF.git -b v0.5.7
+cd OpenRLHF/examples/scripts/
+mkdir -p OpenRLHF
+mkdir -p Dylan2048
+# download the datasets from Hugging Face into the directories created above:
+# for dpo: OpenRLHF/preference_dataset_mixture2_and_safe_pku
+# for kto: Dylan2048/ultrafeedback-unpaired-preferences
+# for ppo: OpenRLHF/prompt-collection-v0.1
+
+# get the pretrain model from Hugging Face: https://huggingface.co/OpenRLHF/Llama-3-8b-sft-mixture
+# get the reward_pretrain model from Hugging Face: https://huggingface.co/OpenRLHF/Llama-3-8b-rm-mixture
+```
+
+### Install Dependencies
+
+```sh
+# install
+cp requirements.txt OpenRLHF/requirements.txt
+cd OpenRLHF
+pip install -e .
+```
+
+## Model Training
+
+```sh
+# Make sure 16 BI-V150 GPUs are available.
+cp *.sh OpenRLHF/examples/scripts/
+cd OpenRLHF/examples/scripts/
+# train with dpo
+bash train_dpo_llama.sh
+
+# train with kto
+bash train_kto_llama.sh
+
+# train with ppo
+bash train_ppo_llama.sh
+
+# Tips:
+# If you see: FileNotFoundError: Directory OpenRLHF/prompt-collection-v0.1 is neither a `Dataset` directory nor a `DatasetDict` directory,
+# change OpenRLHF/openrlhf/utils/utils.py:76 from `data = load_from_disk(dataset)` to `data = load_dataset(dataset, data_dir=data_dir)`
+```
+
+## References
+
+- [OpenRLHF](https://github.com/OpenRLHF/OpenRLHF)
diff --git a/nlp/llm/llama3_8b/openrlhf/requirements.txt b/nlp/llm/llama3_8b/openrlhf/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..47aa5ab23e322aaf421b0782fed6f0caaf7d9163
--- /dev/null
+++ b/nlp/llm/llama3_8b/openrlhf/requirements.txt
@@ -0,0 +1,18 @@
+accelerate
+bitsandbytes
+datasets
+einops
+isort
+jsonlines
+loralib
+optimum
+packaging
+peft
+ray
+tensorboard
+torch
+torchmetrics
+tqdm
+transformers_stream_generator
+wandb
+wheel
\ No newline at end of file
diff --git a/nlp/llm/llama3_8b/openrlhf/train_dpo_llama.sh b/nlp/llm/llama3_8b/openrlhf/train_dpo_llama.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ed4b31cb0a27cc8181612f40c2f1e67a3883e448
--- /dev/null
+++ b/nlp/llm/llama3_8b/openrlhf/train_dpo_llama.sh
@@ -0,0 +1,36 @@
+set -x
+
+read -r -d '' training_commands <
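The `load_from_disk` tip in the README above replaces the call outright. A gentler variant is sketched below; the wrapper function, its name, and the try/except structure are illustrative assumptions rather than OpenRLHF's actual `utils.py` code — only the two loader calls come from the tip itself.

```python
from typing import Optional

from datasets import load_dataset, load_from_disk


def load_local_or_hub_dataset(dataset: str, data_dir: Optional[str] = None):
    """Try a save_to_disk() dump first, then fall back to the generic loader.

    Hypothetical helper illustrating the README tip; OpenRLHF itself calls
    load_from_disk(dataset) directly at openrlhf/utils/utils.py:76.
    """
    try:
        # Only works for directories written by Dataset.save_to_disk().
        return load_from_disk(dataset)
    except FileNotFoundError:
        # Plain Hugging Face downloads such as OpenRLHF/prompt-collection-v0.1
        # are not save_to_disk() dumps, so load them the generic way instead.
        return load_dataset(dataset, data_dir=data_dir)
```

With a fallback like this, datasets prepared locally via `save_to_disk()` and datasets fetched directly from the Hugging Face Hub both load through the same call site, so the one-line patch from the Tips section is no longer needed.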