SlimPLM
Open-source models for SlimPLM.
How to use zstanjj/SlimPLM-Query-Rewriting with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="zstanjj/SlimPLM-Query-Rewriting")

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("zstanjj/SlimPLM-Query-Rewriting")
model = AutoModelForCausalLM.from_pretrained("zstanjj/SlimPLM-Query-Rewriting")
```
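Either handle can then generate text. A minimal sketch using the pipeline above (the placeholder prompt and generation settings are illustrative; the model's actual [INST] prompt format is shown in the full example further down):

```python
# Minimal sketch: greedy generation with the pipeline loaded above.
out = pipe("Once upon a time,", max_new_tokens=64, do_sample=False)
print(out[0]["generated_text"])
```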
How to use zstanjj/SlimPLM-Query-Rewriting with vLLM:

```shell
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "zstanjj/SlimPLM-Query-Rewriting"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "zstanjj/SlimPLM-Query-Rewriting",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
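The same call from Python; a minimal sketch assuming the `requests` package is installed and the vLLM server above is running on localhost:8000 (for the SGLang server below, change the port to 30000):

```python
import requests

# Minimal sketch: POST the completion request shown in the curl example.
resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "zstanjj/SlimPLM-Query-Rewriting",
        "prompt": "Once upon a time,",
        "max_tokens": 512,
        "temperature": 0.5,
    },
)
print(resp.json()["choices"][0]["text"])
```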
How to use zstanjj/SlimPLM-Query-Rewriting with SGLang:
```shell
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "zstanjj/SlimPLM-Query-Rewriting" \
  --host 0.0.0.0 \
  --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "zstanjj/SlimPLM-Query-Rewriting",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```

Alternatively, run the SGLang server with Docker:

```shell
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "zstanjj/SlimPLM-Query-Rewriting" \
    --host 0.0.0.0 \
    --port 30000
```

The server can then be called with the same curl command shown above.

How to use zstanjj/SlimPLM-Query-Rewriting with Docker Model Runner:
```shell
docker model run hf.co/zstanjj/SlimPLM-Query-Rewriting
```
📝 Paper • 🤗 Hugging Face • 🧩 GitHub
🌹 If you use this model, please star our GitHub repository to support us. Your star means a lot!
Full example: building the query-rewriting prompt and running inference with Transformers:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# construct prompt
question = "Who voices Darth Vader in Star Wars Episodes III-VI, IX, Rogue One, and Rebels?"
heuristic_answer = "The voice of Darth Vader in Star Wars is provided by British actor James Earl Jones. He first voiced the character in the 1977 film \"Star Wars: Episode IV - A New Hope\", and his performance has been used in all subsequent Star Wars films, including the prequels and sequels."
prompt = (f"<s>[INST] <<SYS>>\nYou are a helpful assistant. Your task is to parse user input into"
          f" structured formats according to the coarse answer. Current datatime is 2023-12-20 9:47:28"
          f" <</SYS>>\n Course answer: (({heuristic_answer}))\nQuestion: (({question})) [/INST]")
# alternatively, you can input the question only
# prompt = (f"<s>[INST] <<SYS>>\nYou are a helpful assistant. Your task is to parse user input into"
#           f" structured formats. Current datatime is 2023-12-20 9:47:28"
#           f" <</SYS>>\n{question} [/INST]")
params_query_rewrite = {"repetition_penalty": 1.05, "temperature": 0.01, "top_k": 1, "top_p": 0.85,
                        "max_new_tokens": 512, "do_sample": False}
torch.manual_seed(2023)  # "seed" is not a generate() argument; set it globally instead

# deploy model
model = AutoModelForCausalLM.from_pretrained("zstanjj/SlimPLM-Query-Rewriting").eval()
if torch.cuda.is_available():
    model.cuda()
tokenizer = AutoTokenizer.from_pretrained("zstanjj/SlimPLM-Query-Rewriting")

# run inference
input_ids = tokenizer.encode(prompt, return_tensors="pt")  # prompt is an f-string, already filled in
len_input_ids = len(input_ids[0])
if torch.cuda.is_available():
    input_ids = input_ids.cuda()
outputs = model.generate(input_ids, **params_query_rewrite)
res = tokenizer.decode(outputs[0][len_input_ids:], skip_special_tokens=True)
print(res)
```
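The same rewriting prompt can also be sent to either OpenAI-compatible server from the deployment sections above. A minimal sketch, assuming the `openai` Python package and the vLLM server on localhost:8000 (use port 30000 for SGLang); the near-greedy temperature mirrors `params_query_rewrite`:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
resp = client.completions.create(
    model="zstanjj/SlimPLM-Query-Rewriting",
    prompt=prompt,  # the [INST]-formatted prompt built in the example above
    max_tokens=512,
    temperature=0.01,
)
print(resp.choices[0].text)
```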
Citation:

```bibtex
@inproceedings{Tan2024SmallMB,
  title={Small Models, Big Insights: Leveraging Slim Proxy Models To Decide When and What to Retrieve for LLMs},
  author={Jiejun Tan and Zhicheng Dou and Yutao Zhu and Peidong Guo and Kun Fang and Ji-Rong Wen},
  year={2024},
  url={https://arxiv.org/abs/2402.12052}
}
```