import json
import time
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
from typing import Dict, List, Optional

from rich.console import Console

try:
    from .ai import complete
except ImportError:
    from ai import complete

# Route all print calls through rich's console logger.
print = Console().log

paper_types: Dict[str, str] = {
    "CV": "computer vision, any paper that deals with image, video, point cloud or 3D model data",
    "NLP": "natural language processing",
    "LLM": "large language model, if a paper has this label, also include the NLP label",
    "RO": "robotics",
    "ML": "machine learning, use this label only when the paper is not specific to any of the above categories",
}

type_sort_order = ["ML", "CV", "NLP", "LLM", "RO"]

system_prompt = """
You are a professional AI researcher. You are helping users to organize their papers.
""".strip()

user_prompt = "\n\n".join([
    """
You are given a list of papers. You need to classify each paper into one of the following categories:
""".strip(),
    "\n".join(f"- {key}: {description}" for key, description in paper_types.items()),
    """
You will be given several papers at a time. For each paper, you need to classify it into one of the categories above. You should output in the following format with a code block:
""".strip(),
    """
```json
[
    {
        "id": "2402.01032",
        "category": ["RO"]
    },
    {
        "id": "2402.03254",
        "category": ["ML"]
    },
    {
        "id": "2403.00043",
        "category": ["LLM", "NLP"]
    }
]
```

Do not add any additional information in the output. The order of the papers in the output should match the order of the papers in the input.
""".strip(),
    """
The following are the papers you need to classify:
"""
])


def build_paper(id: str, title: str, abstract: Optional[str] = None) -> str:
    if abstract is None:
        return f"{id}: {title}"
    return f"{id}: {title}\n\n{abstract}"


def get_classify_prompt(papers: List[Dict[str, str]]) -> str:
    prompt = []
    for paper in papers:
        prompt.append(build_paper(paper["id"], paper["title"], paper.get("abstract")))
    return user_prompt + "\n\n" + "\n\n".join(prompt)


def parse_response(response: str) -> Optional[List[Dict[str, List[str]]]]:
    print(response)
    # The response is expected to be wrapped in a fenced code block.
    response = response.strip()
    if not response.startswith("```") or not response.endswith("```"):
        return None
    # Drop the first and last lines (the code fences).
    response = "\n".join(response.split("\n")[1:-1])
    try:
        data = json.loads(response)
    except json.JSONDecodeError:
        print(response)
        return None
    for paper in data:
        if "id" not in paper or "category" not in paper:
            return None
        if not isinstance(paper["id"], str) or not isinstance(paper["category"], list):
            return None
        for category in paper["category"]:
            if category not in paper_types:
                return None
    # Papers labelled LLM must also carry the NLP label.
    for paper in data:
        if "LLM" in paper["category"] and "NLP" not in paper["category"]:
            paper["category"].append("NLP")
    # Sort the categories into a stable display order.
    for paper in data:
        paper["category"].sort(key=lambda x: type_sort_order.index(x))
    return data


def get_classification(papers: List[Dict[str, str]]) -> Optional[List[Dict[str, List[str]]]]:
    prompt = get_classify_prompt(papers)
    for _ in range(3):
        try:
            print("Request sent")
            response = complete([
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt},
            ])
            if response is not None:
                return parse_response(response)
        except Exception:
            print(traceback.format_exc())
    return None


class TokenBucket:
    """Simple rate limiter: 200 request tokens, refilled at 200 per minute."""

    def __init__(self):
        self.tokens = 200
        self.last_update = time.time()
        self.lock = Lock()

    def acquire(self, n: int) -> bool:
        with self.lock:
            current_time = time.time()
            elapsed = current_time - self.last_update
            if elapsed >= 60:
                # Add 200 tokens for every full minute that has passed, capped at 200.
                added_tokens = int(elapsed // 60) * 200
                self.tokens = min(self.tokens + added_tokens, 200)
                self.last_update += 60 * (elapsed // 60)
            if self.tokens >= n:
                self.tokens -= n
                return True
            return False


token_bucket = TokenBucket()


class PaperCache:
    """Thread-safe in-memory cache of classification results, keyed by paper id."""

    def __init__(self):
        self.cache = {}
        self.lock = Lock()

    def get(self, paper):
        key = paper["id"]
        with self.lock:
            data = self.cache.get(key)
            if data is not None:
                print(f"Cache hit for {paper['id']}")
                return data
            print(f"Cache miss for {paper['id']}")
            return None

    def set(self, paper, result):
        print(f"Setting cache for {paper['id']}")
        key = paper["id"]
        with self.lock:
            self.cache[key] = result


paper_cache = PaperCache()


def classify_papers(papers: List[Dict[str, str]]) -> Optional[List[Dict[str, List[str]]]]:
    print(f"Classifying {len(papers)} papers")
    cached_results = []
    uncached_papers = []
    # Attach a 1-based index to each paper; papers without an explicit id fall back
    # to their title as the cache/prompt identifier (the demo data below is title-only).
    for index, paper in enumerate(papers, start=1):
        paper["index"] = index
        paper.setdefault("id", paper["title"])
    for paper in papers:
        cached_result = paper_cache.get(paper)
        if cached_result is not None:
            cached_results.append(cached_result)
        else:
            uncached_papers.append(paper)
    # Report cache hit/miss counts.
    print(f"Cache hit: {len(cached_results)}, Cache miss: {len(uncached_papers)}")
    if not uncached_papers:
        return cached_results
    # Classify uncached papers in batches of 10, spending one rate-limit token per batch.
    batches = [uncached_papers[i:i + 10] for i in range(0, len(uncached_papers), 10)]
    num_batches = len(batches)
    if num_batches == 0:
        return cached_results
    if not token_bucket.acquire(num_batches):
        return None
    try:
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = [executor.submit(get_classification, batch) for batch in batches]
            results = []
            for future in as_completed(futures):
                batch_result = future.result()
                if batch_result is None:
                    for f in futures:
                        f.cancel()
                    return None
                results.extend(batch_result)
            print(results)
            results.sort(key=lambda x: x["id"])
            for result in results:
                paper_cache.set(result, result)
            return cached_results + results
    except Exception:
        print(traceback.format_exc())
        return None
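
# Note on `complete` (imported from ai.py, which is not part of this file): judging only
# from how get_classification calls it, it is assumed to take an OpenAI-style list of
# {"role": ..., "content": ...} messages and return the assistant's reply as a plain
# string, or None on failure -- roughly:
#
#     def complete(messages: List[Dict[str, str]]) -> Optional[str]:
#         ...
#
# This is the interface implied by the call site above, not the actual implementation.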


if __name__ == "__main__":
    # Use rich's pretty printer for the demo output instead of Console().log.
    from rich import print

    start = time.time()
    print(classify_papers([
        {
            "title": "OmniHuman-1: Rethinking the Scaling-Up of One-Stage Conditioned Human Animation Models",
            "abstract": "End-to-end human animation, such as audio-driven talking human generation, has undergone notable advancements in the recent few years. However, existing methods still struggle to scale up as large general video generation models, limiting their potential in real applications. In this paper, we propose OmniHuman, a Diffusion Transformer-based framework that scales up data by mixing motion-related conditions into the training phase. To this end, we introduce two training principles for these mixed conditions, along with the corresponding model architecture and inference strategy. These designs enable OmniHuman to fully leverage data-driven motion generation, ultimately achieving highly realistic human video generation. More importantly, OmniHuman supports various portrait contents (face close-up, portrait, half-body, full-body), supports both talking and singing, handles human-object interactions and challenging body poses, and accommodates different image styles. Compared to existing end-to-end audio-driven methods, OmniHuman not only produces more realistic videos, but also offers greater flexibility in inputs. It also supports multiple driving modalities (audio-driven, video-driven and combined driving signals). Video samples are provided on the project page (https://omnihuman-lab.github.io)"
        },
        {
            "title": "SmolLM2: When Smol Goes Big -- Data-Centric Training of a Small Language Model",
            "abstract": "While large language models have facilitated breakthroughs in many applications of artificial intelligence, their inherent largeness makes them computationally expensive and challenging to deploy in resource-constrained settings. In this paper, we document the development of SmolLM2, a state-of-the-art \"small\" (1.7 billion parameter) language model (LM). To attain strong performance, we overtrain SmolLM2 on ~11 trillion tokens of data using a multi-stage training process that mixes web text with specialized math, code, and instruction-following data. We additionally introduce new specialized datasets (FineMath, Stack-Edu, and SmolTalk) at stages where we found existing datasets to be problematically small or low-quality. To inform our design decisions, we perform both small-scale ablations as well as a manual refinement process that updates the dataset mixing rates at each stage based on the performance at the previous stage. Ultimately, we demonstrate that SmolLM2 outperforms other recent small LMs including Qwen2.5-1.5B and Llama3.2-1B. To facilitate future research on LM development as well as applications of small LMs, we release both SmolLM2 as well as all of the datasets we prepared in the course of this project."
        },
        {
            "title": "Generating Multi-Image Synthetic Data for Text-to-Image Customization",
            "abstract": "Customization of text-to-image models enables users to insert custom concepts and generate the concepts in unseen settings. Existing methods either rely on costly test-time optimization or train encoders on single-image training datasets without multi-image supervision, leading to worse image quality. We propose a simple approach that addresses both limitations. We first leverage existing text-to-image models and 3D datasets to create a high-quality Synthetic Customization Dataset (SynCD) consisting of multiple images of the same object in different lighting, backgrounds, and poses. We then propose a new encoder architecture based on shared attention mechanisms that better incorporate fine-grained visual details from input images. Finally, we propose a new inference technique that mitigates overexposure issues during inference by normalizing the text and image guidance vectors. Through extensive experiments, we show that our model, trained on the synthetic dataset with the proposed encoder and inference algorithm, outperforms existing tuning-free methods on standard customization benchmarks."
        },
        {'title': 's1: Simple test-time scaling'},
        {'title': 'Reward-Guided Speculative Decoding for Efficient LLM Reasoning'},
        {'title': 'MatAnyone: Stable Video Matting with Consistent Memory Propagation'},
        {'title': 'Self-supervised Quantized Representation for Seamlessly Integrating Knowledge Graphs with Large Language Models'},
        {'title': 'Scalable-Softmax Is Superior for Attention'},
        {'title': 'PixelWorld: Towards Perceiving Everything as Pixels'},
        {'title': 'DINO-WM: World Models on Pre-trained Visual Features enable Zero-shot Planning'},
        {'title': 'Constitutional Classifiers: Defending against Universal Jailbreaks across Thousands of Hours of Red Teaming'},
        {'title': 'SAeUron: Interpretable Concept Unlearning in Diffusion Models with Sparse Autoencoders'},
        {'title': 'Zero-Shot Novel View and Depth Synthesis with Multi-View Geometric Diffusion'},
        {'title': 'The Surprising Agreement Between Convex Optimization Theory and Learning-Rate Scheduling for Large Model Training'},
        {'title': 'Fast Encoder-Based 3D from Casual Videos via Point Track Processing'},
        {'title': 'Unraveling the Capabilities of Language Models in News Summarization'},
        {'title': 'Trading Inference-Time Compute for Adversarial Robustness'},
        {'title': 'INT: Instance-Specific Negative Mining for Task-Generic Promptable Segmentation'},
        {'title': 'ChunkKV: Semantic-Preserving KV Cache Compression for Efficient Long-Context LLM Inference'},
        {'title': 'VideoJAM: Joint Appearance-Motion Representations for Enhanced Motion Generation in Video Models'},
        {'title': 'Inverse Bridge Matching Distillation'},
        {'title': 'ACECODER: Acing Coder RL via Automated Test-Case Synthesis'},
        {'title': 'Satori: Reinforcement Learning with Chain-of-Action-Thought Enhances LLM Reasoning via Autoregressive Search'},
        {'title': 'QLASS: Boosting Language Agent Inference via Q-Guided Stepwise Search'},
        {'title': 'Concept Steerers: Leveraging K-Sparse Autoencoders for Controllable Generations'},
        {'title': 'Can LLMs Maintain Fundamental Abilities under KV Cache Compression?'},
        {'title': 'Rethinking Mixture-of-Agents: Is Mixing Different Large Language Models Beneficial?'},
        {'title': 'COCONut-PanCap: Joint Panoptic Segmentation and Grounded Captions for Fine-Grained Understanding and Generation'},
        {'title': 'Text-to-CAD Generation Through Infusing Visual Feedback in Large Language Models'},
        {'title': 'Generating Multi-Image Synthetic Data for Text-to-Image Customization'},
        {'title': 'Sample, Scrutinize and Scale: Effective Inference-Time Search by Scaling Verification'},
        {'title': 'Federated Sketching LoRA: On-Device Collaborative Fine-Tuning of Large Language Models'},
        {'title': 'Activation Approximations Can Incur Safety Vulnerabilities Even in Aligned LLMs: Comprehensive Analysis and Defense'},
        {'title': 'SmolLM2: When Smol Goes Big -- Data-Centric Training of a Small Language Model'},
        {'title': 'Demystifying Long Chain-of-Thought Reasoning in LLMs'},
        {'title': 'LIMO: Less is More for Reasoning'},
        {'title': 'TwinMarket: A Scalable Behavioral and Social Simulation for Financial Markets'},
        {'title': 'LayerTracer: Cognitive-Aligned Layered SVG Synthesis via Diffusion Transformer'},
        {'title': 'Boosting Multimodal Reasoning with MCTS-Automated Structured Thinking'},
        {'title': 'Token Assorted: Mixing Latent and Text Tokens for Improved Language Model Reasoning'},
        {'title': 'Large Language Model Guided Self-Debugging Code Generation'},
        {'title': 'Jailbreaking with Universal Multi-Prompts'},
        {'title': 'A Probabilistic Inference Approach to Inference-Time Scaling of LLMs using Particle-Based Monte Carlo Methods'},
        {'title': 'On Teacher Hacking in Language Model Distillation'},
        {'title': 'Riddle Me This! Stealthy Membership Inference for Retrieval-Augmented Generation'},
        {'title': 'Activation-Informed Merging of Large Language Models'},
        {'title': 'HackerRank-ASTRA: Evaluating Correctness & Consistency of Large Language Models on cross-domain multi-file project problems'},
        {'title': 'Analyze Feature Flow to Enhance Interpretation and Steering in Language Models'},
        {'title': 'Gold-medalist Performance in Solving Olympiad Geometry with AlphaGeometry2'},
        {'title': 'DynVFX: Augmenting Real Videos with Dynamic Content'},
        {'title': 'UltraIF: Advancing Instruction Following from the Wild'},
        {'title': 'Great Models Think Alike and this Undermines AI Oversight'},
        {'title': 'ConceptAttention: Diffusion Transformers Learn Highly Interpretable Features'},
        {'title': 'Ola: Pushing the Frontiers of Omni-Modal Language Model with Progressive Modality Alignment'},
        {'title': 'Weak-to-Strong Diffusion with Reflection'},
        {'title': 'MAGA: MAssive Genre-Audience Reformulation to Pretraining Corpus Expansion'},
        {'title': 'MotionLab: Unified Human Motion Generation and Editing via the Motion-Condition-Motion Paradigm'},
        {'title': 'BOLT: Bootstrap Long Chain-of-Thought in Language Models without Distillation'},
        {'title': 'ScoreFlow: Mastering LLM Agent Workflows via Score-based Preference Optimization'},
        {'title': 'Llasa: Scaling Train-Time and Inference-Time Compute for Llama-based Speech Synthesis'},
        {'title': 'MotionCanvas: Cinematic Shot Design with Controllable Image-to-Video Generation'},
        {'title': 'PILAF: Optimal Human Preference Sampling for Reward Modeling'},
        {'title': 'Beyond Prompt Content: Enhancing LLM Performance via Content-Format Integrated Prompt Optimization'},
        {'title': 'ChartCitor: Multi-Agent Framework for Fine-Grained Chart Visual Attribution'},
        {'title': 'Towards Physical Understanding in Video Generation: A 3D Point Regularization Approach'},
        {'title': 'PlotGen: Multi-Agent LLM-based Scientific Data Visualization via Multimodal Feedback'},
        {'title': 'Enhancing Code Generation for Low-Resource Languages: No Silver Bullet'},
        {'title': 'Learning Real-World Action-Video Dynamics with Heterogeneous Masked Autoregression'},
        {'title': 'Speak Easy: Eliciting Harmful Jailbreaks from LLMs with Simple Interactions'},
        {'title': 'OmniHuman-1: Rethinking the Scaling-Up of One-Stage Conditioned Human Animation Models'},
        {'title': 'The Differences Between Direct Alignment Algorithms are a Blur'},
        {'title': 'Process Reinforcement through Implicit Rewards'},
        {'title': 'Preference Leakage: A Contamination Problem in LLM-as-a-judge'},
        {'title': 'AlignVLM: Bridging Vision and Language Latent Spaces for Multimodal Understanding'},
        {'title': 'SafeRAG: Benchmarking Security in Retrieval-Augmented Generation of Large Language Model'},
        {'title': 'SliderSpace: Decomposing the Visual Capabilities of Diffusion Models'},
    ]))
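
    # The second batch below repeats most of the titles from the first call (plus a few
    # new ones), presumably to exercise PaperCache: repeated papers should come back as
    # cache hits, and only the unseen titles should trigger fresh classification requests.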
    print(classify_papers([
        {'title': 'Federated Sketching LoRA: On-Device Collaborative Fine-Tuning of Large Language Models'},
        {'title': 'Activation Approximations Can Incur Safety Vulnerabilities Even in Aligned LLMs: Comprehensive Analysis and Defense'},
        {'title': 'SmolLM2: When Smol Goes Big -- Data-Centric Training of a Small Language Model'},
        {'title': 'Demystifying Long Chain-of-Thought Reasoning in LLMs'},
        {'title': 'LIMO: Less is More for Reasoning'},
        {'title': 'TwinMarket: A Scalable Behavioral and Social Simulation for Financial Markets'},
        {'title': 'LayerTracer: Cognitive-Aligned Layered SVG Synthesis via Diffusion Transformer'},
        {'title': 'Boosting Multimodal Reasoning with MCTS-Automated Structured Thinking'},
        {'title': 'Token Assorted: Mixing Latent and Text Tokens for Improved Language Model Reasoning'},
        {'title': 'Large Language Model Guided Self-Debugging Code Generation'},
        {'title': 'Jailbreaking with Universal Multi-Prompts'},
        {'title': 'A Probabilistic Inference Approach to Inference-Time Scaling of LLMs using Particle-Based Monte Carlo Methods'},
        {'title': 'On Teacher Hacking in Language Model Distillation'},
        {'title': 'Riddle Me This! Stealthy Membership Inference for Retrieval-Augmented Generation'},
        {'title': 'Activation-Informed Merging of Large Language Models'},
        {'title': 'HackerRank-ASTRA: Evaluating Correctness & Consistency of Large Language Models on cross-domain multi-file project problems'},
        {'title': 'Analyze Feature Flow to Enhance Interpretation and Steering in Language Models'},
        {'title': 'Gold-medalist Performance in Solving Olympiad Geometry with AlphaGeometry2'},
        {'title': 'DynVFX: Augmenting Real Videos with Dynamic Content'},
        {'title': 'UltraIF: Advancing Instruction Following from the Wild'},
        {'title': 'Great Models Think Alike and this Undermines AI Oversight'},
        {'title': 'ConceptAttention: Diffusion Transformers Learn Highly Interpretable Features'},
        {'title': 'Ola: Pushing the Frontiers of Omni-Modal Language Model with Progressive Modality Alignment'},
        {'title': 'Weak-to-Strong Diffusion with Reflection'},
        {'title': 'MAGA: MAssive Genre-Audience Reformulation to Pretraining Corpus Expansion'},
        {'title': 'MotionLab: Unified Human Motion Generation and Editing via the Motion-Condition-Motion Paradigm'},
        {'title': 'BOLT: Bootstrap Long Chain-of-Thought in Language Models without Distillation'},
        {'title': 'ScoreFlow: Mastering LLM Agent Workflows via Score-based Preference Optimization'},
        {'title': 'Llasa: Scaling Train-Time and Inference-Time Compute for Llama-based Speech Synthesis'},
        {'title': 'MotionCanvas: Cinematic Shot Design with Controllable Image-to-Video Generation'},
        {'title': 'PILAF: Optimal Human Preference Sampling for Reward Modeling'},
        {'title': 'Beyond Prompt Content: Enhancing LLM Performance via Content-Format Integrated Prompt Optimization'},
        {'title': 'ChartCitor: Multi-Agent Framework for Fine-Grained Chart Visual Attribution'},
        {'title': 'Towards Physical Understanding in Video Generation: A 3D Point Regularization Approach'},
        {'title': 'PlotGen: Multi-Agent LLM-based Scientific Data Visualization via Multimodal Feedback'},
        {'title': 'Enhancing Code Generation for Low-Resource Languages: No Silver Bullet'},
        {'title': 'Learning Real-World Action-Video Dynamics with Heterogeneous Masked Autoregression'},
        {'title': 'Speak Easy: Eliciting Harmful Jailbreaks from LLMs with Simple Interactions'},
        {'title': 'OmniHuman-1: Rethinking the Scaling-Up of One-Stage Conditioned Human Animation Models'},
        {'title': 'The Differences Between Direct Alignment Algorithms are a Blur'},
        {'title': 'Process Reinforcement through Implicit Rewards'},
        {'title': 'Preference Leakage: A Contamination Problem in LLM-as-a-judge'},
        {'title': 'AlignVLM: Bridging Vision and Language Latent Spaces for Multimodal Understanding'},
        {'title': 'SafeRAG: Benchmarking Security in Retrieval-Augmented Generation of Large Language Model'},
        {'title': 'SliderSpace: Decomposing the Visual Capabilities of Diffusion Models'},
        {'title': 'MM-IQ: Benchmarking Human-Like Abstraction and Reasoning in Multimodal Models'},
        {'title': 'DeepRAG: Thinking to Retrieval Step by Step for Large Language Models'},
        {'title': 'Scaling Embedding Layers in Language Models'},
        {'title': 'MakeAnything: Harnessing Diffusion Transformers for Multi-Domain Procedural Sequence Generation'},
        {'title': 'AIN: The Arabic INclusive Large Multimodal Model'},
        {'title': 'FastKV: KV Cache Compression for Fast Long-Context Processing with Token-Selective Propagation'},
        {'title': 'ZebraLogic: On the Scaling Limits of LLMs for Logical Reasoning'},
        {'title': 'The Jumping Reasoning Curve? Tracking the Evolution of Reasoning Performance in GPT-[n] and o-[n] Models on Multimodal Puzzles'},
        {'title': 'RandLoRA: Full-rank parameter-efficient fine-tuning of large models'},
        {'title': 'Almost Surely Safe Alignment of Large Language Models at Inference-Time'},
        {'title': 'Improving Transformer World Models for Data-Efficient RL'},
        {'title': 'PhD Knowledge Not Required: A Reasoning Challenge for Large Language Models'},
        {'title': 'Improved Training Technique for Latent Consistency Models'},
        {'title': 'LongDPO: Unlock Better Long-form Generation Abilities for LLMs via Critique-augmented Stepwise Information'},
        {'title': 'Learning to Generate Unit Tests for Automated Debugging'},
        {'title': 'Lifelong Sequential Knowledge Editing without Model Degradation'},
        {'title': 'A Study on the Performance of U-Net Modifications in Retroperitoneal Tumor Segmentation'},
        {'title': 'Language Models Prefer What They Know: Relative Confidence Estimation via Confidence Preferences'},
        {'title': 'Current Pathology Foundation Models are unrobust to Medical Center Differences'}
    ]))
    print(f"Time taken: {time.time() - start}")
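
# For reference: each classify_papers call above is expected to return a list of
# {"id": ..., "category": [...]} dicts shaped like the JSON example in user_prompt
# (cached results first, then freshly classified ones sorted by id), or None if a batch
# fails or the TokenBucket is exhausted. With 200 tokens refilled per minute and one
# token spent per batch of up to 10 papers, the script can dispatch roughly 2,000 papers
# per minute at most.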