Upload magic-bert-50m-classification model files
Files changed:
- README.md (+23 -28)
- model.safetensors (+2 -2)
README.md

````diff
@@ -205,32 +205,13 @@ The model classifies files into 106 MIME types across these categories:
 ## How to Use
 
 ```python
-from transformers import AutoTokenizer
-from safetensors.torch import load_file
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
 import torch
-import json
-
-# Load tokenizer and MIME mapping
-tokenizer = AutoTokenizer.from_pretrained("path/to/magic-bert-50m-classification")
-with open("path/to/magic-bert-50m-classification/mime_type_mapping.json") as f:
-    mime_mapping = json.load(f)
-id_to_mime = {int(k): v for k, v in mime_mapping.items()}
-
-# Load model
-from modeling_magic_bert import MagicBERTForSequenceClassification
-from configuration_magic_bert import MagicBERTConfig
-
-config = MagicBERTConfig.from_pretrained("path/to/magic-bert-50m-classification")
-model = MagicBERTForSequenceClassification(config)
 
-
-
-
-
-# Load contrastive head weights
-contrastive_dict = load_file("path/to/magic-bert-50m-classification/contrastive_head.safetensors")
-model.projection.load_state_dict({k.replace("projection.", ""): v for k, v in contrastive_dict.items() if "projection" in k})
-model.classifier.load_state_dict({k.replace("classifier.", ""): v for k, v in contrastive_dict.items() if "classifier" in k})
+model = AutoModelForSequenceClassification.from_pretrained(
+    "mjbommar/magic-bert-50m-classification", trust_remote_code=True
+)
+tokenizer = AutoTokenizer.from_pretrained("mjbommar/magic-bert-50m-classification")
 
 model.eval()
 
@@ -240,28 +221,42 @@ with open("example.pdf", "rb") as f:
 
 # Decode bytes to string using latin-1 (preserves all byte values 0-255)
 text = data.decode("latin-1")
-
 inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
 
 with torch.no_grad():
     outputs = model(**inputs)
     predicted_id = outputs.logits.argmax(-1).item()
+    confidence = torch.softmax(outputs.logits, dim=-1).max().item()
 
-print(f"Predicted MIME type: {id_to_mime[predicted_id]}")
+print(f"Predicted class: {predicted_id}")
+print(f"Confidence: {confidence:.2%}")
 ```
 
 ### Getting Embeddings for Similarity Search
 
 ```python
-# Get normalized embeddings
+# Get normalized embeddings (256-dim, L2-normalized)
 with torch.no_grad():
     embeddings = model.get_embeddings(inputs["input_ids"], inputs["attention_mask"])
-# embeddings shape: [batch_size, 256]
+# embeddings shape: [batch_size, 256]
 
 # Compute cosine similarity between files
 similarity = torch.mm(embeddings1, embeddings2.T)
 ```
 
+### Loading MIME Type Labels
+
+```python
+from huggingface_hub import hf_hub_download
+import json
+
+mime_path = hf_hub_download("mjbommar/magic-bert-50m-classification", "mime_type_mapping.json")
+with open(mime_path) as f:
+    id_to_mime = {int(k): v for k, v in json.load(f).items()}
+
+print(f"Predicted MIME type: {id_to_mime[predicted_id]}")
+```
+
 ## Limitations
 
 1. **Position bias:** Best performance when content starts at position 0. Accuracy degrades for content at higher offsets.
````
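Read end to end, the updated quickstart assembles into one runnable script. The sketch below stitches together the added hunks; the file-reading lines (`with open("example.pdf", "rb") ...`) are unchanged context inferred from the second hunk header rather than shown in this diff, so they may differ slightly from the actual README.

```python
import json

import torch
from huggingface_hub import hf_hub_download
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the classifier and its tokenizer straight from the Hub
model = AutoModelForSequenceClassification.from_pretrained(
    "mjbommar/magic-bert-50m-classification", trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained("mjbommar/magic-bert-50m-classification")
model.eval()

# Read raw bytes; latin-1 decoding preserves all byte values 0-255
with open("example.pdf", "rb") as f:  # unchanged context inferred from the hunk header
    data = f.read()
text = data.decode("latin-1")
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

with torch.no_grad():
    outputs = model(**inputs)
    predicted_id = outputs.logits.argmax(-1).item()
    confidence = torch.softmax(outputs.logits, dim=-1).max().item()

# Map the numeric class id back to a MIME type label
mime_path = hf_hub_download("mjbommar/magic-bert-50m-classification", "mime_type_mapping.json")
with open(mime_path) as f:
    id_to_mime = {int(k): v for k, v in json.load(f).items()}

print(f"Predicted MIME type: {id_to_mime[predicted_id]} (confidence {confidence:.2%})")
```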
model.safetensors

```diff
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:93b0ae348f5fac2a4eac22b83b2540fd21bb0b45e4d308c26d9f849ebc1ebd22
+size 170737312
```