deberta-large-japanese-unidic-ud-head
KoichiYasuoka
Pregunta y respuesta
Este es un modelo DeBERTa(V2) preentrenado en 青空文庫 para análisis de dependencias (detección de cabezas en palabras de unidad larga) como respuesta a preguntas, derivado de deberta-large-japanese-unidic y UD_Japanese-GSDLUW. Utilice [MASK] dentro del contexto para evitar ambigüedades al especificar una palabra de uso múltiple como pregunta.
Cómo usar
# Minimal example: find the dependency head of the word '国語' in the
# sentence by extractive question answering (question = the word,
# answer span = its head word).
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

repo = 'KoichiYasuoka/deberta-large-japanese-unidic-ud-head'
tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModelForQuestionAnswering.from_pretrained(repo)

question = '国語'
context = '全学年にわたって小学校の国語の教科書に挿し絵が用いられている'
encoding = tokenizer(question, context, return_tensors='pt')
output = model(**encoding)

# Highest-scoring start/end token positions delimit the predicted head.
span_start = torch.argmax(output.start_logits)
span_end = torch.argmax(output.end_logits)
print(tokenizer.convert_ids_to_tokens(encoding['input_ids'][0, span_start:span_end + 1]))
from transformers import (AutoTokenizer, AutoModelForQuestionAnswering, AutoModelForTokenClassification, AutoConfig, TokenClassificationPipeline)
class TaggerPipeline(TokenClassificationPipeline):
    """Token-classification pipeline that back-fills character offsets.

    Slow tokenizers yield results without 'start'/'end' character
    offsets; this subclass recovers them by aligning each predicted
    surface form against the raw text with spacy_alignments, widening
    spans that begin or end with the unknown token.
    """

    def __call__(self, text):
        results = super().__call__(text)
        # Only repair offsets when the pipeline did not supply them.
        if results and results[0].get('start') is None:
            import spacy_alignments as tokenizations
            surfaces = [item['word'].replace(' ', '') for item in results]
            to_text, _ = tokenizations.get_alignments(surfaces, text)
            unk = self.tokenizer.unk_token
            for idx, mapping in enumerate(to_text):
                if mapping:
                    begin, finish = mapping[0], mapping[-1] + 1
                else:
                    begin, finish = 0, 0
                if surfaces[idx].startswith(unk):
                    # Start just after the previous aligned token (or at 0).
                    prior = [m for m in to_text[:idx] if m]
                    begin = (prior[-1][-1] + 1) if prior else 0
                if surfaces[idx].endswith(unk):
                    # End right before the next aligned token (or at EOS).
                    later = [m for m in to_text[idx + 1:] if m]
                    finish = later[0][0] if later else len(text)
                results[idx]['start'], results[idx]['end'] = begin, finish
        return results
class TransformersSlowUD(object):
	"""Universal Dependencies parser driven by a QA head-detection model.

	Combines three fine-tuned models sharing one tokenizer:
	  * deprel — token classification assigning each long-unit word its
	    dependency relation (aggregated spans),
	  * tagger — token classification assigning UPOS and features,
	  * QA model — for each word (used as the question, with that word
	    masked in the context) predicts the span of its head word.
	Calling the instance on a sentence returns the parse in CoNLL-U.
	"""

	def __init__(self, bert):
		"""Load the QA model plus the 'deprel' and 'tagger' sub-models.

		bert: local directory or Hugging Face hub repo id.
		"""
		import os
		self.tokenizer = AutoTokenizer.from_pretrained(bert)
		self.model = AutoModelForQuestionAnswering.from_pretrained(bert)
		x = AutoModelForTokenClassification.from_pretrained
		if os.path.isdir(bert):
			# Local checkout: sub-models live in subdirectories.
			d, t = x(os.path.join(bert, 'deprel')), x(os.path.join(bert, 'tagger'))
		else:
			# Hub repo: fetch each sub-model's config and weights explicitly.
			from transformers.utils import cached_file
			c = AutoConfig.from_pretrained(cached_file(bert, 'deprel/config.json'))
			d = x(cached_file(bert, 'deprel/pytorch_model.bin'), config = c)
			s = AutoConfig.from_pretrained(cached_file(bert, 'tagger/config.json'))
			t = x(cached_file(bert, 'tagger/pytorch_model.bin'), config = s)
		self.deprel = TaggerPipeline(model = d, tokenizer = self.tokenizer, aggregation_strategy = 'simple')
		self.tagger = TaggerPipeline(model = t, tokenizer = self.tokenizer)

	def __call__(self, text):
		"""Parse *text* and return its dependency tree in CoNLL-U format."""
		import numpy, torch, ufal.chu_liu_edmonds
		# Word spans with dependency relations from the aggregated pipeline.
		w = [(t['start'], t['end'], t['entity_group']) for t in self.deprel(text)]
		# UPOS|feature labels keyed by each word's start offset.
		z, n = {t['start']: t['entity'].split('|') for t in self.tagger(text)}, len(w)
		# r: word surface forms; m: (n+1)x(n+1) head-score matrix, row/col 0 = root.
		r, m = [text[s:e] for s, e, p in w], numpy.full((n + 1, n + 1), numpy.nan)
		v, c = self.tokenizer(r, add_special_tokens = False)['input_ids'], []
		for i, t in enumerate(v):
			# QA input per word i: [CLS] word [SEP] context-with-word-i-masked [SEP]
			q = [self.tokenizer.cls_token_id] + t + [self.tokenizer.sep_token_id]
			c.append([q] + v[0:i] + [[self.tokenizer.mask_token_id]] + v[i + 1:] + [[q[-1]]])
		# b[i][j]: cumulative token offset where word j starts inside example i.
		b = [[len(sum(x[0:j + 1], [])) for j in range(len(x))] for x in c]
		with torch.no_grad():
			d = self.model(input_ids = torch.tensor([sum(x, []) for x in c]),
				token_type_ids = torch.tensor([[0] * x[0] + [1] * (x[-1] - x[0]) for x in b]))
		s, e = d.start_logits.tolist(), d.end_logits.tolist()
		for i in range(n):
			for j in range(n):
				# Score of word j being the head of word i; the masked slot
				# (i == j) supplies the root-attachment score (column 0).
				m[i + 1, 0 if i == j else j + 1] = s[i][b[i][j]] + e[i][b[i][j + 1] - 1]
		h = ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]
		if [0 for i in h if i == 0] != [0]:
			# More or fewer than one root: keep the word labelled 'root' if
			# present, otherwise the best-scoring root candidate, and re-run.
			i = ([p for s, e, p in w] + ['root']).index('root')
			# FIX: restore '< n' (stripped by HTML rendering of the card).
			j = i + 1 if i < n else numpy.nanargmax(m[:, 0])
			m[0:j, 0] = m[j + 1:, 0] = numpy.nan
			h = ufal.chu_liu_edmonds.chu_liu_edmonds(m)[0]
		u = '# text = ' + text.replace('\n', ' ') + '\n'
		for i, (s, e, p) in enumerate(w, 1):
			p = 'root' if h[i] == 0 else 'dep' if p == 'root' else p
			# FIX: restore the comparisons stripped by HTML rendering;
			# MISC gets SpaceAfter=No unless a gap precedes the next word.
			u += '\t'.join([str(i), r[i - 1], '_', z[s][0][2:], '_', '|'.join(z[s][1:]),
				str(h[i]), p, '_',
				'_' if i < n and e < w[i][0] else 'SpaceAfter=No']) + '\n'
		return u + '\n'
# Build the parser and print the CoNLL-U parse of a sample sentence.
nlp = TransformersSlowUD('KoichiYasuoka/deberta-large-japanese-unidic-ud-head')
parse = nlp('全学年にわたって小学校の国語の教科書に挿し絵が用いられている')
print(parse)
Funcionalidades
- Respuesta a preguntas
- Análisis de dependencias
- Detección de cabezas en palabras de unidad larga
- Basado en DeBERTa(V2)
Casos de uso
- Análisis de dependencias en japonés
- Detección de cabezas en palabras de unidad larga
- Respuesta a preguntas en japonés