RLHFlow/ArmoRM-Llama3-8B-v0.1
RLHFlow
Text classification
ArmoRM-Llama3-8B-v0.1 is a text-classification reward model fine-tuned for multiple reward objectives, which are combined through a Mixture-of-Experts (MoE) gating mechanism. It is part of RLHFlow's work on reward modeling with interpretable preferences. The model was fine-tuned from the base model FsfairX-LLaMA3-RM-v0.1 and is notable for its compatibility with AutoTrain and text-generation inference.
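At a high level, a forward pass returns a vector of per-objective rewards together with gating coefficients over those objectives, and the scalar preference score is their weighted combination; the assertion in the demo code below checks exactly this identity:
preference_score = sum_k(coeff_k * reward_k), with coeff = gating_output @ reward_transform_matrix.T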
How to use
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
device = "cuda"
path = "RLHFlow/ArmoRM-Llama3-8B-v0.1"
model = AutoModelForSequenceClassification.from_pretrained(
    path, device_map=device, trust_remote_code=True, torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(path, use_fast=True)
# We load a random sample from the validation set of the HelpSteer dataset
prompt = 'What are some synonyms for the word "beautiful"?'
response = "Nicely, Beautifully, Handsome, Stunning, Wonderful, Gorgeous, Pretty, Stunning, Elegant"
messages = [{"role": "user", "content": prompt},
            {"role": "assistant", "content": response}]
input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt").to(device)
with torch.no_grad():
    output = model(input_ids)
    # Multi-objective rewards for the response (one per attribute listed below)
    multi_obj_rewards = output.rewards.cpu().float()
    # Output of the gating layer, conditioned on the prompt
    gating_output = output.gating_output.cpu().float()
    # Scalar preference score aggregated from the multi-objective rewards
    preference_score = output.score.cpu().float()
obj_transform = model.reward_transform_matrix.data.cpu().float()
# The gating output is combined with the reward transform matrix to obtain
# per-objective mixing coefficients; the preference score is the
# coefficient-weighted sum of the multi-objective rewards.
multi_obj_coeffs = gating_output @ obj_transform.T
assert torch.isclose(torch.sum(multi_obj_rewards * multi_obj_coeffs, dim=1), preference_score, atol=1e-3)
# Find the top-K objectives with the largest (absolute) mixing coefficients
K = 3
top_obj_dims = torch.argsort(torch.abs(multi_obj_coeffs), dim=1, descending=True)[:, :K]
top_obj_coeffs = torch.gather(multi_obj_coeffs, dim=1, index=top_obj_dims)
attributes = ['helpsteer-helpfulness', 'helpsteer-correctness', 'helpsteer-coherence',
              'helpsteer-complexity', 'helpsteer-verbosity', 'ultrafeedback-overall_score',
              'ultrafeedback-instruction_following', 'ultrafeedback-truthfulness',
              'ultrafeedback-honesty', 'ultrafeedback-helpfulness', 'beavertails-is_safe',
              'prometheus-score', 'argilla-overall_quality', 'argilla-judge_lm', 'code-complexity',
              'code-style', 'code-explanation', 'code-instruction-following', 'code-readability']
example_index = 0
for i in range(K):
    attribute = attributes[top_obj_dims[example_index, i].item()]
    coeff = top_obj_coeffs[example_index, i].item()
    print(f"{attribute}: {round(coeff, 5)}")
# The actual rewards of this example from the HelpSteer dataset
# are [3,3,4,2,2] for the five helpsteer objectives:
# helpfulness, correctness, coherence, complexity, verbosity
# We can linearly transform our predicted rewards to the
# original reward space to compare with the ground truth
helpsteer_rewards_pred = multi_obj_rewards[0, :5] * 5 - 0.5
print(helpsteer_rewards_pred)
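For inspection, the full set of 19 objective rewards can also be paired with their attribute names. The following is a small optional sketch that reuses the multi_obj_rewards tensor and the attributes list from the demo above:
# Optional sketch: map each objective reward to its attribute name
# (assumes `multi_obj_rewards` and `attributes` from the demo code above).
rewards_by_attribute = {
    name: multi_obj_rewards[0, i].item()
    for i, name in enumerate(attributes)
}
for name, value in sorted(rewards_by_attribute.items(), key=lambda kv: -kv[1]):
    print(f"{name}: {value:.4f}")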
The model can also be wrapped in a small, reusable pipeline class that scores a conversation and returns the scalar preference score:
from typing import Dict, List
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
class ArmoRMPipeline:
    def __init__(self, model_id, device_map="auto", torch_dtype=torch.bfloat16,
                 truncation=True, trust_remote_code=False, max_length=4096):
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_id,
            device_map=device_map,
            trust_remote_code=trust_remote_code,
            torch_dtype=torch_dtype,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            use_fast=True,
        )
        self.truncation = truncation
        self.device = self.model.device
        self.max_length = max_length

    def __call__(self, messages: List[Dict[str, str]]) -> Dict[str, float]:
        input_ids = self.tokenizer.apply_chat_template(
            messages,
            return_tensors="pt",
            padding=True,
            truncation=self.truncation,
            max_length=self.max_length,
        ).to(self.device)
        with torch.no_grad():
            output = self.model(input_ids)
            score = output.score.float().item()
        return {"score": score}
# Create Reward Model Pipeline
prompt = 'What are some synonyms for the word "beautiful"?'
rm = ArmoRMPipeline("RLHFlow/ArmoRM-Llama3-8B-v0.1", trust_remote_code=True)
# score the messages
response1 = 'Nicely, Beautifully, Handsome, Stunning, Wonderful, Gorgeous, Pretty, Stunning, Elegant'
score1 = rm([{"role": "user", "content": prompt}, {"role": "assistant", "content": response1}])
print(score1)
response2 = '''Certainly! Here are some synonyms for the word "beautiful":
1. Gorgeous
2. Lovely
3. Stunning
4. Attractive
5. Pretty
6. Elegant
7. Exquisite
8. Handsome
9. Charming
10. Alluring
11. Radiant
12. Magnificent
13. Graceful
14. Enchanting
15. Dazzling
These synonyms can be used in various contexts to convey the idea of beauty.'''
score2 = rm([{"role": "user", "content": prompt}, {"role": "assistant", "content": response2}])
print(score2)
response3 = 'Sorry i cannot answer this.'
score3 = rm([{"role": "user", "content": prompt}, {"role": "assistant", "content": response3}])
print(score3)
Features
- Text classification
- Built on the transformers architecture
- safetensors support
- Multi-objective reward modeling
- Compatible with text-generation inference
- Fine-tuned from the FsfairX-LLaMA3-RM-v0.1 model
Use cases
- Text classification
- Multi-objective reward modeling
- User preference evaluation, e.g. ranking candidate responses (see the sketch after this list)
- Reducing verbosity bias in reward models
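As a sketch of the preference-evaluation use case, the ArmoRMPipeline class defined above can rank several candidate responses to the same prompt and keep the highest-scoring one (best-of-N selection). The candidate strings below are illustrative, not from the original card:
# Sketch: best-of-N response selection with the ArmoRMPipeline defined above.
rm = ArmoRMPipeline("RLHFlow/ArmoRM-Llama3-8B-v0.1", trust_remote_code=True)

prompt = 'What are some synonyms for the word "beautiful"?'
candidates = [  # illustrative candidate responses
    "Gorgeous, lovely, stunning, attractive, elegant.",
    "Sorry, I cannot answer this.",
]

# Score every (prompt, candidate) conversation and keep the best one.
scored = [
    (rm([{"role": "user", "content": prompt},
         {"role": "assistant", "content": c}])["score"], c)
    for c in candidates
]
best_score, best_response = max(scored)
print(best_score, best_response)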