mjbommar committed on
Commit
b93fce0
·
verified ·
1 Parent(s): 383b8ea

Upload magic-bert-50m-classification model files

Browse files
Files changed (2) hide show
  1. README.md +23 -28
  2. model.safetensors +2 -2
README.md CHANGED
@@ -205,32 +205,13 @@ The model classifies files into 106 MIME types across these categories:
205
  ## How to Use
206
 
207
  ```python
208
- from transformers import AutoTokenizer
209
- from safetensors.torch import load_file
210
  import torch
211
- import json
212
-
213
- # Load tokenizer and MIME mapping
214
- tokenizer = AutoTokenizer.from_pretrained("path/to/magic-bert-50m-classification")
215
- with open("path/to/magic-bert-50m-classification/mime_type_mapping.json") as f:
216
- mime_mapping = json.load(f)
217
- id_to_mime = {int(k): v for k, v in mime_mapping.items()}
218
-
219
- # Load model
220
- from modeling_magic_bert import MagicBERTForSequenceClassification
221
- from configuration_magic_bert import MagicBERTConfig
222
-
223
- config = MagicBERTConfig.from_pretrained("path/to/magic-bert-50m-classification")
224
- model = MagicBERTForSequenceClassification(config)
225
 
226
- # Load base model weights
227
- state_dict = load_file("path/to/magic-bert-50m-classification/model.safetensors")
228
- model.load_state_dict(state_dict, strict=False)
229
-
230
- # Load contrastive head weights
231
- contrastive_dict = load_file("path/to/magic-bert-50m-classification/contrastive_head.safetensors")
232
- model.projection.load_state_dict({k.replace("projection.", ""): v for k, v in contrastive_dict.items() if "projection" in k})
233
- model.classifier.load_state_dict({k.replace("classifier.", ""): v for k, v in contrastive_dict.items() if "classifier" in k})
234
 
235
  model.eval()
236
 
@@ -240,28 +221,42 @@ with open("example.pdf", "rb") as f:
240
 
241
  # Decode bytes to string using latin-1 (preserves all byte values 0-255)
242
  text = data.decode("latin-1")
243
-
244
  inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
245
 
246
  with torch.no_grad():
247
  outputs = model(**inputs)
248
  predicted_id = outputs.logits.argmax(-1).item()
 
249
 
250
- print(f"Predicted MIME type: {id_to_mime[predicted_id]}")
 
251
  ```
252
 
253
  ### Getting Embeddings for Similarity Search
254
 
255
  ```python
256
- # Get normalized projection embeddings
257
  with torch.no_grad():
258
  embeddings = model.get_embeddings(inputs["input_ids"], inputs["attention_mask"])
259
- # embeddings shape: [batch_size, 256], L2 normalized
260
 
261
  # Compute cosine similarity between files (embeddings1 / embeddings2: get_embeddings outputs for the two files being compared)
262
  similarity = torch.mm(embeddings1, embeddings2.T)
263
  ```
264
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
  ## Limitations
266
 
267
  1. **Position bias:** Best performance when content starts at position 0. Accuracy degrades for content at higher offsets.
 
205
  ## How to Use
206
 
207
  ```python
208
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
 
209
  import torch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
 
211
+ model = AutoModelForSequenceClassification.from_pretrained(
212
+ "mjbommar/magic-bert-50m-classification", trust_remote_code=True
213
+ )
214
+ tokenizer = AutoTokenizer.from_pretrained("mjbommar/magic-bert-50m-classification")
 
 
 
 
215
 
216
  model.eval()
217
 
 
221
 
222
  # Decode bytes to string using latin-1 (preserves all byte values 0-255)
223
  text = data.decode("latin-1")
 
224
  inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
225
 
226
  with torch.no_grad():
227
  outputs = model(**inputs)
228
  predicted_id = outputs.logits.argmax(-1).item()
229
+ confidence = torch.softmax(outputs.logits, dim=-1).max().item()
230
 
231
+ print(f"Predicted class: {predicted_id}")
232
+ print(f"Confidence: {confidence:.2%}")
233
  ```
234
 
235
  ### Getting Embeddings for Similarity Search
236
 
237
  ```python
238
+ # Get normalized embeddings (256-dim, L2-normalized)
239
  with torch.no_grad():
240
  embeddings = model.get_embeddings(inputs["input_ids"], inputs["attention_mask"])
241
+ # embeddings shape: [batch_size, 256]
242
 
243
  # Compute cosine similarity between files (embeddings1 / embeddings2: get_embeddings outputs for the two files being compared)
244
  similarity = torch.mm(embeddings1, embeddings2.T)
245
  ```
246
 
247
+ ### Loading MIME Type Labels
248
+
249
+ ```python
250
+ from huggingface_hub import hf_hub_download
251
+ import json
252
+
253
+ mime_path = hf_hub_download("mjbommar/magic-bert-50m-classification", "mime_type_mapping.json")
254
+ with open(mime_path) as f:
255
+ id_to_mime = {int(k): v for k, v in json.load(f).items()}
256
+
257
+ print(f"Predicted MIME type: {id_to_mime[predicted_id]}")
258
+ ```
259
+
260
  ## Limitations
261
 
262
  1. **Position bias:** Best performance when content starts at position 0. Accuracy degrades for content at higher offsets.
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a3923cd4384639bde231f53f2b40822cc71fdc920d43bf4b97a5b6edafad3d2c
3
- size 236291992
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93b0ae348f5fac2a4eac22b83b2540fd21bb0b45e4d308c26d9f849ebc1ebd22
3
+ size 170737312