Density-aware Soft Context Compression with Semi-Dynamic Compression Ratio
Paper • 2603.25926 • Published • 8
This Hugging Face repository hosts fine-tuned LoRA checkpoints for the models introduced in the paper Density-aware Soft Context Compression with Semi-Dynamic Compression Ratio.
These models compress long documents into latent tokens and inject them into the decoder at placeholder positions. This framework features a Discrete Ratio Selector that predicts a compression target based on the intrinsic information density of the input, outperforming static baselines and establishing a robust Pareto frontier for context compression.
To use these weights, you need the modeling_ctxcomp.py file from the official repository.
In this mode, the model predicts how many compressed tokens to use. Use exactly one placeholder in the prompt.
import os
import sys
import torch
import json
from transformers import AutoTokenizer
# Ensure modeling_ctxcomp.py is in your path
# sys.path.insert(0, "/path/to/semi_dynamic_soft_context_compress")
from modeling_ctxcomp import CtxCompSemiDynamicModel
# Read the checkpoint config to recover the base encoder/decoder model paths
# and the id of the compression placeholder token.
checkpoint_dir = "yuyijiong/qwen3-semi-dynamic-soft-context-compress"
config_path = os.path.join(checkpoint_dir, "config.json")
with open(config_path, "r") as cfg_file:
    config = json.load(cfg_file)

encoder_path = config.get("base_encoder_model_path")
decoder_path = config.get("base_decoder_model_path")
placeholder_token_id = config["placeholder_token_id"]

# Encoder tokenizer pads on the left; decoder tokenizer uses its defaults.
tokenizer_encoder = AutoTokenizer.from_pretrained(
    encoder_path, trust_remote_code=True, padding_side="left"
)
tokenizer_decoder = AutoTokenizer.from_pretrained(decoder_path, trust_remote_code=True)
# Resolve the placeholder id into its literal token string for use in prompts.
placeholder_token = tokenizer_decoder.convert_ids_to_tokens(placeholder_token_id)
# Load the fine-tuned compression model in bfloat16, sharded across available
# devices, and switch it to inference mode.
load_kwargs = dict(
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
model = CtxCompSemiDynamicModel.from_pretrained(checkpoint_dir, **load_kwargs)
model.eval()
document = "Your long document text here..."
question = "What is the main idea of the document?"

# Exactly one placeholder; the model predicts the number of latent tokens and
# expands the placeholder internally.
# FIX: the original broke the f-string across a raw newline (a SyntaxError);
# use an explicit \n escape inside a single string literal instead.
prompt_text = f"Context: {placeholder_token}\nQuestion: {question}"

# Encode the document with the encoder-side tokenizer and move the tensors to
# the decoder's device (the model routes encoder inputs from there).
doc_encoded = tokenizer_encoder(
    document, return_tensors="pt", padding=True, truncation=True, max_length=2048
)
doc_input_ids = doc_encoded["input_ids"].to(model.decoder.device)
doc_attention_mask = doc_encoded["attention_mask"].to(model.decoder.device)

# Wrap the prompt in the decoder's chat template and build a dense attention mask.
messages = [{"role": "user", "content": prompt_text}]
prompt_ids = tokenizer_decoder.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True, return_tensors="pt",
)
input_ids = prompt_ids.to(model.decoder.device)
attention_mask = torch.ones_like(input_ids, device=input_ids.device)
# compress_ratio_scale shifts the predicted compression ratio:
# positive = more compression (e.g. 0.5), negative = less (e.g. -0.5).
compress_ratio_scale = 0.5

generation_kwargs = dict(
    doc_input_ids=doc_input_ids,
    input_ids=input_ids,
    doc_attention_mask=doc_attention_mask,
    attention_mask=attention_mask,
    compress_ratio_scale=compress_ratio_scale,
    max_new_tokens=256,
    do_sample=False,
)
with torch.no_grad():
    out = model.generate(**generation_kwargs)

# Drop the prompt tokens and decode only the newly generated answer.
prompt_len = input_ids.shape[1]
generated_ids = out[0][prompt_len:]
answer = tokenizer_decoder.decode(generated_ids, skip_special_tokens=True)
print(answer)
@article{yuyijiong2026density,
title={Density-aware Soft Context Compression with Semi-Dynamic Compression Ratio},
author={Yuyi Jiong and others},
journal={arXiv preprint arXiv:2603.25926},
year={2026}
}