RLHFlow/ArmoRM-Llama3-8B-v0.1

RLHFlow
Text Classification

ArmoRM-Llama3-8B-v0.1 is a text-classification model fine-tuned for multiple reward objectives using a Mixture-of-Experts (MoE) gating network. It is part of RLHFlow's work on reward modeling with interpretable preferences. The model was fine-tuned from the base model FsfairX-LLaMA3-RM-v0.1 and is compatible with AutoTrain and Text Generation Inference.
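Concretely, the model predicts a reward for each objective together with a gating vector over objectives, and the scalar preference score is the coefficient-weighted sum of the per-objective rewards (this identity is verified with an assert in the usage code below). A minimal numeric sketch with made-up values:

import torch

# Hypothetical rewards for three objectives (e.g. helpfulness, correctness, verbosity)
multi_obj_rewards = torch.tensor([0.62, 0.55, 0.30])
# Hypothetical mixing coefficients derived from the MoE gating output
multi_obj_coeffs = torch.tensor([0.50, 0.40, 0.10])
# The scalar preference score is the coefficient-weighted sum of the rewards
preference_score = torch.sum(multi_obj_rewards * multi_obj_coeffs)
print(preference_score)  # tensor(0.5600)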

How to use

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

device = "cuda"
path = "RLHFlow/ArmoRM-Llama3-8B-v0.1"
model = AutoModelForSequenceClassification.from_pretrained(
    path, device_map=device, trust_remote_code=True, torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(path, use_fast=True)

# We load a random sample from the validation set of the HelpSteer dataset
prompt = 'What are some synonyms for the word "beautiful"?'
response = "Nicely, Beautifully, Handsome, Stunning, Wonderful, Gorgeous, Pretty, Stunning, Elegant"
messages = [{"role": "user", "content": prompt},
            {"role": "assistant", "content": response}]
input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt").to(device)
with torch.no_grad():
    output = model(input_ids)
    # Per-objective rewards (19 objectives; see the attribute list below)
    multi_obj_rewards = output.rewards.cpu().float()
    # Output of the MoE gating layer for this prompt
    gating_output = output.gating_output.cpu().float()
    # Scalar preference score
    preference_score = output.score.cpu().float()
    obj_transform = model.reward_transform_matrix.data.cpu().float()
    # The multi-objective coefficients are the gating output mapped through
    # the reward transform matrix
    multi_obj_coeffs = gating_output @ obj_transform.T
    # The preference score is the coefficient-weighted sum of the objective rewards
    assert torch.isclose(torch.sum(multi_obj_rewards * multi_obj_coeffs, dim=1), preference_score, atol=1e-3).all()
    # Find the K objectives with the largest (absolute) coefficients
    K = 3
    top_obj_dims = torch.argsort(torch.abs(multi_obj_coeffs), dim=1, descending=True)[:, :K]
    top_obj_coeffs = torch.gather(multi_obj_coeffs, dim=1, index=top_obj_dims)
    attributes = ['helpsteer-helpfulness', 'helpsteer-correctness', 'helpsteer-coherence',
                  'helpsteer-complexity', 'helpsteer-verbosity', 'ultrafeedback-overall_score',
                  'ultrafeedback-instruction_following', 'ultrafeedback-truthfulness',
                  'ultrafeedback-honesty', 'ultrafeedback-helpfulness', 'beavertails-is_safe',
                  'prometheus-score', 'argilla-overall_quality', 'argilla-judge_lm', 'code-complexity',
                  'code-style', 'code-explanation', 'code-instruction-following', 'code-readability']

    example_index = 0
    # Print the K most heavily weighted objectives for this example
    for i in range(K):
        attribute = attributes[top_obj_dims[example_index, i].item()]
        coeff = top_obj_coeffs[example_index, i].item()
        print(f"{attribute}: {round(coeff, 5)}")

# The actual rewards of this example from the HelpSteer dataset
# are [3,3,4,2,2] for the five helpsteer objectives:
# helpfulness, correctness, coherence, complexity, verbosity
# We can linearly transform our predicted rewards to the
# original reward space to compare with the ground truth
helpsteer_rewards_pred = multi_obj_rewards[0, :5] * 5 - 0.5
print(helpsteer_rewards_pred)
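For repeated scoring, the model and tokenizer can be wrapped in a small pipeline class that takes a conversation and returns the scalar preference score: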
from typing import Dict, List
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

class ArmoRMPipeline:
    def __init__(self, model_id, device_map="auto", torch_dtype=torch.bfloat16, truncation=True, trust_remote_code=False, max_length=4096):
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_id,
            device_map=device_map,
            trust_remote_code=trust_remote_code,
            torch_dtype=torch_dtype,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            use_fast=True,
        )
        self.truncation = truncation
        self.device = self.model.device
        self.max_length = max_length

    def __call__(self, messages: List[Dict[str, str]]) -> Dict[str, float]:
        # Tokenize the conversation using the model's chat template
        input_ids = self.tokenizer.apply_chat_template(
            messages,
            return_tensors="pt",
            padding=True,
            truncation=self.truncation,
            max_length=self.max_length,
        ).to(self.device)
        with torch.no_grad():
            output = self.model(input_ids)
            # Return the scalar preference score for the conversation
            score = output.score.float().item()
        return {"score": score}

# Create the reward model pipeline
rm = ArmoRMPipeline("RLHFlow/ArmoRM-Llama3-8B-v0.1", trust_remote_code=True)

# Score the messages
prompt = 'What are some synonyms for the word "beautiful"?'
response1 = 'Nicely, Beautifully, Handsome, Stunning, Wonderful, Gorgeous, Pretty, Stunning, Elegant'
score1 = rm([{"role": "user", "content": prompt}, {"role": "assistant", "content": response1}])
print(score1)

response2 = '''Certainly! Here are some synonyms for the word "beautiful":

1. Gorgeous
2. Lovely
3. Stunning
4. Attractive
5. Pretty
6. Elegant
7. Exquisite
8. Handsome
9. Charming
10. Alluring
11. Radiant
12. Magnificent
13. Graceful
14. Enchanting
15. Dazzling

These synonyms can be used in various contexts to convey the idea of beauty.'''
score2 = rm([{"role": "user", "content": prompt}, {"role": "assistant", "content": response2}])
print(score2)

response3 = 'Sorry i cannot answer this.'
score3 = rm([{"role": "user", "content": prompt}, {"role": "assistant", "content": response3}])
print(score3)
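If scoring behaves as intended, the detailed list in response2 should receive the highest score and the refusal in response3 the lowest; exact values depend on the model version and environment.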

Features

Text classification
Transformers-based architecture
Safetensors support
Multi-objective reward modeling
Text Generation Inference compatibility
Fine-tuned from FsfairX-LLaMA3-RM-v0.1

Use cases

Text classification
Multi-objective reward modeling
Evaluating user preferences, e.g. ranking candidate responses (see the sketch below)
Reducing verbosity bias in reward models
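As an illustration of the preference-evaluation use case, the ArmoRMPipeline class defined above can rank several candidate responses and keep the highest-scoring one. A minimal best-of-N sketch; the prompt and candidate strings here are made up for illustration:

prompt = "How do I boil an egg?"
candidates = [
    "Place the egg in boiling water for 8-10 minutes, then cool it in ice water.",
    "Just boil it.",
    "Sorry, I cannot answer this.",
]

# Score each candidate conversation with the reward model
scores = []
for response in candidates:
    result = rm([{"role": "user", "content": prompt},
                 {"role": "assistant", "content": response}])
    scores.append(result["score"])

# Keep the highest-scoring candidate
best_idx = max(range(len(candidates)), key=lambda i: scores[i])
print(f"Best response (score {scores[best_idx]:.4f}): {candidates[best_idx]}")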