Azure99 committed
Commit 523b6bc · verified · 1 Parent(s): b689c89

Update app.py

Files changed (1)
  app.py +31 -22
app.py CHANGED
@@ -1,19 +1,24 @@
 import json
+from threading import Thread
+
 import gradio as gr
 import spaces
-from huggingface_hub import hf_hub_download
-from llama_cpp import Llama
-from transformers import AutoTokenizer
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    TextIteratorStreamer,
+    FineGrainedFP8Config,
+)
 
 MAX_NEW_TOKENS = 8192
 MODEL_NAME = "Azure99/Blossom-V6.3-36B"
-MODEL_GGUF_REPO = f"{MODEL_NAME}-GGUF"
-MODEL_FILE = "blossom-v6.3-36b-q8_0.gguf"
-MODEL_LOCAL_DIR = "./"
-
-hf_hub_download(repo_id=MODEL_GGUF_REPO, filename=MODEL_FILE, local_dir=MODEL_LOCAL_DIR)
 
-llm: Llama = None
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_NAME,
+    torch_dtype="auto",
+    device_map="auto",
+    quantization_config=FineGrainedFP8Config(),
+)
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 
 
@@ -33,27 +38,31 @@ def get_messages(user, history):
 
 @spaces.GPU(duration=120)
 def chat(user, history, temperature, top_p, repetition_penalty):
-    global llm
-    if llm is None:
-        llm = Llama(
-            model_path=MODEL_FILE, n_gpu_layers=-1, flash_attn=True, n_ctx=16384
-        )
+    streamer = TextIteratorStreamer(
+        tokenizer, skip_prompt=True, skip_special_tokens=True
+    )
 
     messages = get_messages(user, history)
     print(f"Messages: {messages}")
-    input_ids = tokenizer.apply_chat_template(messages)
-    generate_config = dict(
+    input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt").to(
+        model.device
+    )
+
+    generation_kwargs = dict(
+        input_ids=input_ids,
+        streamer=streamer,
+        do_sample=True,
+        max_new_tokens=MAX_NEW_TOKENS,
         temperature=temperature,
         top_p=top_p,
-        repeat_penalty=repetition_penalty,
-        top_k=0,
-        stream=True,
-        max_tokens=MAX_NEW_TOKENS,
+        repetition_penalty=repetition_penalty,
     )
 
+    Thread(target=model.generate, kwargs=generation_kwargs).start()
+
     outputs = ""
-    for chunk in llm(input_ids, **generate_config):
-        outputs += chunk["choices"][0]["text"]
+    for new_text in streamer:
+        outputs += new_text
         yield outputs
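
In short: the commit replaces the llama-cpp-python GGUF pipeline (hf_hub_download of a Q8_0 file plus a lazily constructed Llama instance) with a direct transformers load, quantized on the fly via FineGrainedFP8Config, and streams tokens by running model.generate() on a background thread that feeds a TextIteratorStreamer. Below is a minimal, self-contained sketch of that streaming pattern; the tiny stand-in checkpoint and the generation settings are illustrative assumptions, not part of the commit (the Space itself loads Azure99/Blossom-V6.3-36B as shown in the diff).

from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Stand-in checkpoint so the sketch runs anywhere on CPU; the Space loads
# Azure99/Blossom-V6.3-36B with quantization_config=FineGrainedFP8Config().
model_name = "sshleifer/tiny-gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

inputs = tokenizer("Hello", return_tensors="pt")

# skip_prompt=True drops the echoed prompt; skip_special_tokens=True is
# forwarded to the tokenizer's decode calls, cleaning the streamed text.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# model.generate() blocks until generation finishes, so it runs on a worker
# thread while this thread iterates the streamer for decoded chunks.
Thread(
    target=model.generate,
    kwargs=dict(**inputs, streamer=streamer, max_new_tokens=32, do_sample=True),
).start()

outputs = ""
for new_text in streamer:
    outputs += new_text
    print(outputs)  # app.py yields each partial string to Gradio instead

One detail worth noting in the new generation_kwargs: do_sample=True is required for temperature, top_p, and repetition_penalty to take effect, since transformers' generate() ignores sampling parameters under greedy decoding.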