Text Classification
Transformers
PyTorch
Vietnamese
vietnamese
summarization
evaluation
cross-encoder
research
Instructions to use phuongntc/MultiEvalVietSum with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use phuongntc/MultiEvalVietSum with Transformers:
# Use a pipeline as a high-level helper from transformers import pipeline pipe = pipeline("text-classification", model="phuongntc/MultiEvalVietSum")# Load model directly from transformers import AutoModel model = AutoModel.from_pretrained("phuongntc/MultiEvalVietSum", dtype="auto") - Notebooks
- Google Colab
- Kaggle
| import torch | |
| from modeling_multievalvietsum import MultiEvalVietSumModel | |
| def build_pair_feature(tokenizer, document, summary, max_len=2048, summary_max_len=192): | |
| sum_ids = tokenizer( | |
| summary, | |
| truncation=True, | |
| max_length=summary_max_len, | |
| add_special_tokens=False, | |
| return_attention_mask=False, | |
| )["input_ids"] | |
| doc_ids = tokenizer( | |
| document, | |
| truncation=False, | |
| add_special_tokens=False, | |
| return_attention_mask=False, | |
| )["input_ids"] | |
| special_pair_tokens = tokenizer.num_special_tokens_to_add(pair=True) | |
| doc_budget = max(16, max_len - len(sum_ids) - special_pair_tokens) | |
| doc_ids = doc_ids[:doc_budget] | |
| model_inputs = getattr(tokenizer, "model_input_names", []) | |
| return_token_type_ids = "token_type_ids" in model_inputs | |
| try: | |
| feat = tokenizer.prepare_for_model( | |
| doc_ids, | |
| pair_ids=sum_ids, | |
| add_special_tokens=True, | |
| padding=False, | |
| truncation=False, | |
| return_attention_mask=True, | |
| return_token_type_ids=return_token_type_ids, | |
| ) | |
| feat = {k: v for k, v in feat.items() if k in {"input_ids", "attention_mask", "token_type_ids"}} | |
| return feat | |
| except Exception: | |
| cls_id = tokenizer.cls_token_id | |
| sep_id = tokenizer.sep_token_id | |
| input_ids = [cls_id] + doc_ids + [sep_id] + sum_ids + [sep_id] | |
| attention_mask = [1] * len(input_ids) | |
| feat = { | |
| "input_ids": input_ids, | |
| "attention_mask": attention_mask, | |
| } | |
| if return_token_type_ids: | |
| feat["token_type_ids"] = [0] * (len(doc_ids) + 2) + [1] * (len(sum_ids) + 1) | |
| return feat | |
| def predict_scores(document: str, summary: str, model_dir: str = "."): | |
| model, cfg = MultiEvalVietSumModel.from_pretrained_local(model_dir) | |
| tokenizer = MultiEvalVietSumModel.load_tokenizer_local(model_dir) | |
| feat = build_pair_feature( | |
| tokenizer, | |
| document=document, | |
| summary=summary, | |
| max_len=cfg["max_len"], | |
| summary_max_len=cfg["summary_max_len"], | |
| ) | |
| batch = { | |
| "input_ids": torch.tensor([feat["input_ids"]], dtype=torch.long), | |
| "attention_mask": torch.tensor([feat["attention_mask"]], dtype=torch.long), | |
| } | |
| if "token_type_ids" in feat: | |
| batch["token_type_ids"] = torch.tensor([feat["token_type_ids"]], dtype=torch.long) | |
| scores = model(**batch)[0].cpu().tolist() | |
| return { | |
| "faithfulness": float(scores[0]), | |
| "coherence": float(scores[1]), | |
| "relevance": float(scores[2]), | |
| } | |
| if __name__ == "__main__": | |
| doc = "Văn bản gốc mẫu." | |
| summ = "Bản tóm tắt mẫu." | |
| print(predict_scores(doc, summ)) | |