nvidia/Nemotron-PII
Viewer • Updated • 200k • 4.06k • 99
How to use PITTI/privacy-filter-nemotron with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="PITTI/privacy-filter-nemotron")

# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("PITTI/privacy-filter-nemotron")
model = AutoModelForTokenClassification.from_pretrained("PITTI/privacy-filter-nemotron")

# This model can be used with mlx-raclate for native inference on Apple Silicon.
from mlx_raclate.utils.utils import load
from mlx_raclate.utils.token_classification import (
postprocess_token_classification_output,
viterbi_transition_biases_from_calibration,
)
# Load model and tokenizer
model_path = "PITTI/privacy-filter-nemotron"
model, tokenizer = load(
    model_path,
    pipeline="token-classification",
)

# Prepare input texts
texts = ['John works at Apple in California.', 'Microsoft was founded by Bill Gates.']

# Tokenize. The length limit comes from the model config and falls back to
# 512 when the config does not define `max_position_embeddings`.
max_length = getattr(model.config, "max_position_embeddings", 512)
# NOTE(review): this goes through the wrapper's private `_tokenizer`
# attribute -- presumably the underlying Hugging Face tokenizer; confirm
# against the mlx-raclate version in use.
tokens = tokenizer._tokenizer(
    texts,
    return_tensors="mlx",
    padding=True,
    truncation=True,
    max_length=max_length,
    return_offsets_mapping=True,
)
# The offsets are only passed to post-processing below, so remove them from
# the tokenizer output before running the model.
offset_mapping = tokens.pop("offset_mapping")

# Run inference
outputs = model(
    input_ids=tokens["input_ids"],
    attention_mask=tokens["attention_mask"],
    return_dict=True,
)

# Get predictions
logits = outputs["logits"]
id2label = model.config.id2label
# `None` is passed when the model has no `viterbi_calibration` attribute.
transition_biases = viterbi_transition_biases_from_calibration(
    getattr(model, "viterbi_calibration", None)
)
processed = postprocess_token_classification_output(
    logits=logits,
    probabilities=outputs["probabilities"],
    id2label=id2label,
    texts=texts,
    offsets=offset_mapping.tolist(),
    transition_biases=transition_biases,
)

# Process and print grouped spans
for i, text in enumerate(texts):
    print(f"Text: {text}")
    print("Grouped spans:")
    for span in processed["grouped_spans"][i]:
        print(f" {span['entity_group']}: {span['word']!r} [{span['start']}, {span['end']}] score={span['score']:.3f}")
    # Blank separator line; assumed to be printed once per text (the source
    # lost its indentation, so this placement is a reconstruction).
    print()
OpenAI Privacy Filter models finetuned with Raclate are natively supported by Transformers. However, post-processing is necessary.
With transformers>=5.8.1, this checkpoint uses the standard Hugging Face openai_privacy_filter architecture. AutoModelForTokenClassification returns token logits; the helper below greedily decodes BIOES labels into character spans.
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer
def decode_bioes_spans(text, offsets, label_ids, scores, id2label):
    """Greedily decode per-token BIOES labels into character spans.

    Tokens are visited in order. Decoding rules:

    * Tokens whose offset is empty (``end <= start``) are skipped and do not
      close an open span.
    * ``"O"`` closes the open span, if any.
    * ``S-<group>`` (or a label without any ``-``) closes the open span and
      emits a single-token span.
    * ``B-<group>`` always starts a new span. ``I-``/``E-`` labels also start
      a new span when no span is open or the open span belongs to a different
      group; in that case an ``E-`` label does not close the span it opened.
    * Otherwise the token extends the open span, and ``E-`` closes it.

    Leading and trailing whitespace is trimmed from every span; spans that
    become empty are dropped.

    Args:
        text: The original string the offsets index into.
        offsets: Per-token ``(start, end)`` character offsets.
        label_ids: Per-token predicted label ids.
        scores: Per-token scores of the predicted labels.
        id2label: Mapping from label id to label string. Integer keys are
            tried first, then their string form (as found in a mapping loaded
            from raw JSON).

    Returns:
        A list of dicts with keys ``entity_group``, ``score`` (mean of the
        token scores in the span), ``word``, ``start`` and ``end``, in the
        order the spans were closed.

    Raises:
        KeyError: If a label id is missing from a dict-like ``id2label``.
    """
    spans = []
    current = None

    def label_for(label_id):
        """Look up the label string, accepting int or str keys."""
        key = int(label_id)
        try:
            return id2label[key]
        except KeyError:
            return id2label[str(key)]

    def emit(span):
        """Trim whitespace from ``span`` and append it to ``spans``."""
        if span is None:
            return
        start = span["start"]
        end = span["end"]
        # Token offsets may include surrounding whitespace; shrink the span
        # so that `word` starts and ends on non-space characters.
        while start < end and text[start].isspace():
            start += 1
        while end > start and text[end - 1].isspace():
            end -= 1
        if end <= start:
            return
        span_scores = span["scores"]
        spans.append(
            {
                "entity_group": span["entity_group"],
                "score": sum(span_scores) / len(span_scores),
                "word": text[start:end],
                "start": start,
                "end": end,
            }
        )

    for offset, label_id, score in zip(offsets, label_ids, scores):
        start, end = int(offset[0]), int(offset[1])
        if end <= start:
            # Zero-width offsets carry no text.
            continue

        label = label_for(label_id)
        if label == "O":
            emit(current)
            current = None
            continue

        # A label without a BIOES prefix is treated as a single-token entity.
        prefix, entity_group = label.split("-", 1) if "-" in label else ("S", label)

        if prefix == "S":
            emit(current)
            emit(
                {
                    "entity_group": entity_group,
                    "start": start,
                    "end": end,
                    "scores": [float(score)],
                }
            )
            current = None
            continue

        if prefix == "B" or current is None or current["entity_group"] != entity_group:
            emit(current)
            current = {
                "entity_group": entity_group,
                "start": start,
                "end": end,
                "scores": [float(score)],
            }
            continue

        # Continuation (I- or E-) of the open span of the same group.
        current["end"] = end
        current["scores"].append(float(score))
        if prefix == "E":
            emit(current)
            current = None

    # Flush a span that was still open at the end of the sequence.
    emit(current)
    return spans
model_id = 'PITTI/privacy-filter-nemotron'
texts = ['John works at Apple in California.', 'Microsoft was founded by Bill Gates.']

tokenizer = AutoTokenizer.from_pretrained(model_id, fix_mistral_regex=True)
model = AutoModelForTokenClassification.from_pretrained(model_id)
model.eval()

encoded = tokenizer(
    texts,
    return_tensors="pt",
    padding=True,
    truncation=True,
    return_offsets_mapping=True,
)
# The offsets are only needed for span decoding; remove them so that
# `encoded` can be unpacked straight into the model call.
offset_mapping = encoded.pop("offset_mapping")

# Inference only: no gradients are needed for the forward pass.
with torch.no_grad():
    logits = model(**encoded).logits

probabilities = torch.softmax(logits, dim=-1)
# A single `max` call yields both the top probability and its class index,
# so each score always belongs to the label it is reported with.
label_scores, label_ids = probabilities.max(dim=-1)

for text, offsets, ids, scores in zip(
    texts,
    offset_mapping.tolist(),
    label_ids.tolist(),
    label_scores.tolist(),
):
    print(f"Text: {text}")
    print("Grouped spans:")
    spans = decode_bioes_spans(text, offsets, ids, scores, model.config.id2label)
    for span in spans:
        print(
            f" {span['entity_group']}: {span['word']!r} "
            f"[{span['start']}, {span['end']}] score={span['score']:.3f}"
        )
    # Blank separator line after each text.
    print()
token-classification
OpenMed/privacy-filter-nemotron, an amazing project led by Maziyar Panahi
Base model
openai/privacy-filter