Demo Fixes 6

commit fdb92ff061
parent bfaffef684
2025-03-30 03:03:14 -04:00


@@ -112,6 +112,15 @@ def load_models():
             torch_dtype=torch.bfloat16
         )
         models.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
+
+        # Configure all special tokens
+        models.tokenizer.pad_token = models.tokenizer.eos_token
+        models.tokenizer.padding_side = "left"  # For causal language modeling
+
+        # Inform the model about the pad token
+        if hasattr(models.llm.config, "pad_token_id") and models.llm.config.pad_token_id is None:
+            models.llm.config.pad_token_id = models.tokenizer.pad_token_id
+
         logger.info("Llama 3.2 model loaded successfully")
         socketio.emit('model_status', {'model': 'llm', 'status': 'loaded'})
         progress = 100
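
Taken on its own, the new setup does two things: it gives the Llama tokenizer a pad token (Llama checkpoints ship without one) and mirrors that choice in the model config. A minimal sketch of the same configuration, assuming the model is loaded with transformers' AutoModelForCausalLM (the loading call sits above this hunk) and that the gated meta-llama/Llama-3.2-1B checkpoint is accessible:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

llm = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B", torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")

# Llama tokenizers define no pad token; reusing EOS enables padded
# batches without resizing the embedding table.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"  # pads go before the prompt, not after

# Keep the model config in sync so generate() stops warning about
# an unset pad_token_id.
if llm.config.pad_token_id is None:
    llm.config.pad_token_id = tokenizer.pad_token_id

Left padding is the safer default for decoder-only generation: generate() continues from the final position of each row, and with right padding that position would be a pad token rather than the end of the prompt.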
@@ -392,31 +401,41 @@ def process_audio_and_respond(session_id, data):
         prompt = f"{conversation_history}Assistant: "
 
         # Generate response with Llama
-        input_tokens = models.tokenizer(
-            prompt,
-            return_tensors="pt",
-            padding=True,
-            return_attention_mask=True
-        )
-        input_ids = input_tokens.input_ids.to(DEVICE)
-        attention_mask = input_tokens.attention_mask.to(DEVICE)
-
-        with torch.no_grad():
-            generated_ids = models.llm.generate(
-                input_ids,
-                attention_mask=attention_mask,
-                max_new_tokens=100,
-                temperature=0.7,
-                top_p=0.9,
-                do_sample=True,
-                pad_token_id=models.tokenizer.eos_token_id
-            )
-
-        # Decode the response
-        response_text = models.tokenizer.decode(
-            generated_ids[0][input_ids.shape[1]:],
-            skip_special_tokens=True
-        ).strip()
+        try:
+            # Ensure pad token is set
+            if models.tokenizer.pad_token is None:
+                models.tokenizer.pad_token = models.tokenizer.eos_token
+
+            input_tokens = models.tokenizer(
+                prompt,
+                return_tensors="pt",
+                padding=True,
+                return_attention_mask=True
+            )
+            input_ids = input_tokens.input_ids.to(DEVICE)
+            attention_mask = input_tokens.attention_mask.to(DEVICE)
+
+            with torch.no_grad():
+                generated_ids = models.llm.generate(
+                    input_ids,
+                    attention_mask=attention_mask,
+                    max_new_tokens=100,
+                    temperature=0.7,
+                    top_p=0.9,
+                    do_sample=True,
+                    pad_token_id=models.tokenizer.eos_token_id
+                )
+
+            # Decode the response
+            response_text = models.tokenizer.decode(
+                generated_ids[0][input_ids.shape[1]:],
+                skip_special_tokens=True
+            ).strip()
+        except Exception as e:
+            logger.error(f"Error generating response: {str(e)}")
+            import traceback
+            logger.error(traceback.format_exc())
+            response_text = "I'm sorry, I encountered an error while processing your request."
 
         # Synthesize speech
         with app.app_context():
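
The decode slice generated_ids[0][input_ids.shape[1]:] is what the padding_side = "left" choice from the first hunk protects. With left padding, every prompt in a batch ends at the same column, so newly generated tokens always begin at input_ids.shape[1]. A sketch with a hypothetical two-prompt batch (the prompt strings are invented for illustration):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

batch = tokenizer(
    ["User: Hi\nAssistant: ", "User: What is Flask?\nAssistant: "],
    return_tensors="pt",
    padding=True,
    return_attention_mask=True,
)

# The shorter prompt is padded on the left, so both rows end with the
# prompt's final token; anything generate() appends past column
# batch.input_ids.shape[1] is purely new text, for every row.
print(batch.input_ids.shape)    # e.g. torch.Size([2, 12])
print(batch.attention_mask[0])  # leading zeros mark the left padding

The surrounding try/except trades silence for availability: if generation fails for any reason, the handler logs the full traceback and the session proceeds to speech synthesis with a fixed apology string instead of crashing.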