[
  {
    "title": "Activation Steering via Generative Causal Mediation",
    "authors": "Aruna Sankaranarayanan, Amir Zur, Atticus Geiger, Dylan Hadfield-Menell",
    "venue": "ICLR 2026",
    "year": 2026,
    "url": "https://arxiv.org/abs/2602.16080",
    "description": "Researchers tackle the challenge of pinpointing and controlling specific behaviors in language models, which can be dispersed throughout lengthy responses. By developing a new approach, they aim to provide more precise control over these models, enabling more effective intervention and modification of their outputs. This work has implications for improving the reliability and trustworthiness of language models in real-world applications.",
    "category": "uses_nnsight",
    "image": "/images/Activation-Steering-via-Generative-Causal-Mediation.png"
  },
  {
    "title": "ADAG: Automatically Describing Attribution Graphs",
    "authors": "Aryaman Arora, Zhengxuan Wu, Jacob Steinhardt, Sarah Schwettmann",
    "venue": "ArXiv 2026",
    "year": 2026,
    "url": "https://arxiv.org/abs/2604.07615",
    "description": "Researchers have developed a new automated pipeline, ADAG, to explain how language models work by identifying the internal features that contribute to specific outputs. This approach eliminates the need for manual interpretation, instead using attribution profiles and clustering algorithms to quantify the role of each feature. By providing a more transparent understanding of language model computations, ADAG can help uncover and address potential issues, such as biased or harmful outputs.",
    "category": "referencing",
    "image": "/images/ADAG-Automatically-Describing-Attribution-Graphs.png"
  },
  {
    "title": "Behind the Scenes: Mechanistic Interpretability of LoRA-adapted Whisper for Speech Emotion Recognition",
    "authors": "Yujian Ma, Jinqiu Sang, Ruizhe Li",
    "venue": "ArXiv 2026",
    "year": 2026,
    "url": "https://ieeexplore.ieee.org/abstract/document/11464049/",
    "description": "Conducts the first systematic mechanistic interpretability study of LoRA within Whisper encoder for speech emotion recognition. Using layer contribution probing, logit-lens inspection, SVD, and CKA analysis, the authors reveal two key mechanisms: delayed specialization that preserves general features before consolidating task-specific information, and forward alignment with backward differentiation between LoRA matrices. Findings clarify how LoRA reshapes encoder hierarchies, providing insights for efficient adaptation strategies.",
    "category": "uses_nnsight"
  },
  {
    "title": "Can you map it to English? The Role of Cross-Lingual Alignment in the Multilingual Performance of LLMs",
    "authors": "Kartik Ravisankar, HyoJung Han, Sarah Wiegreffe, Marine Carpuat",
    "venue": "EACL 2026",
    "year": 2026,
    "url": "https://doi.org/10.18653/v1/2026.eacl-long.225",
    "description": "Researchers investigate the role of linguistic and cultural biases in machine translation systems, shedding light on how these biases can affect the accuracy and fairness of translations. By examining the intersection of language, culture, and technology, this study aims to promote more inclusive and equitable language technologies. Its findings have implications for the development of more culturally sensitive machine translation systems.",
    "category": "uses_nnsight",
    "image": "/images/Can-you-map-it-to-English-The-Role-of-Cross-Lingual-Alignment-in-the-Multilingual-Performance-of-LLMs.png"
  },
  {
    "title": "Counting Hypothesis: Potential Mechanism of In-Context Learning",
    "authors": "Jung H. Lee, Sujith Vijayan",
    "venue": "ArXiv 2026",
    "year": 2026,
    "url": "https://arxiv.org/abs/2602.01687",
    "description": "Researchers are working to understand how large language models can learn specific tasks from just a few examples, a phenomenon known as In-Context Learning. This ability has the potential to expand the use of these models into new areas, but its underlying mechanisms are still not well understood, making it difficult to correct errors or diagnose issues. By proposing a new hypothesis, the \"counting hypothesis,\" this study aims to shed light on how these models support In-Context Learning.",
    "category": "uses_nnsight",
    "image": "/images/Counting-Hypothesis-Potential-Mechanism-of-In-Context-Learning.png"
  },
  {
    "title": "DFWe: Efficient Knowledge Distillation of Fine-tuned Whisper Encoder for Speech Emotion Recognition",
    "authors": "Y Ma, X Jiang, J Sang, R Li",
    "venue": "Pattern Recognition 2026",
    "year": 2026,
    "url": "https://www.sciencedirect.com/science/article/pii/S0031320326001263",
    "description": "Researchers are exploring ways to adapt powerful pre-trained speech models, like Whisper, to recognize emotions in speech, a task that requires more nuanced understanding than just acoustic modeling. By addressing the limitations of these models in capturing emotional cues, this work aims to improve speech emotion recognition. This advancement could lead to more empathetic and human-like interactions with voice-based systems.",
    "category": "uses_nnsight",
    "image": "/images/DFWe-Efficient-Knowledge-Distillation-of-Fine-tuned-Whisper-Encoder-for-Speech-Emotion-Recognition.png"
  },
  {
    "title": "Disentangling meaning from language in LLM-based machine translation",
    "authors": "Théo Lasnier, Armel Zebaze, Djamé Seddah, Rachel Bawden, Benoît Sagot",
    "venue": "ArXiv 2026",
    "year": 2026,
    "url": "https://arxiv.org/abs/2602.04613",
    "description": "Researchers are working to understand how Large Language Models (LLMs) work by developing new methods for Mechanistic Interpretability (MI), which aims to explain the inner workings of neural networks. This effort is crucial for building trust in AI systems and identifying potential biases or flaws. By shedding light on how LLMs process and generate language, MI can help improve the reliability and transparency of these powerful models.",
    "category": "uses_nnsight",
    "image": "/images/Disentangling-meaning-from-language-in-LLM-based-machine-translation.png"
  },
  {
    "title": "Do Transformers Use their Depth Adaptively? Evidence from a Relational Reasoning Task",
    "authors": "Alicia Curth, Rachel Lawrence, Sushrut Karmalkar, Niranjani Prasad",
    "venue": "ICLR 2026",
    "year": 2026,
    "url": "https://arxiv.org/abs/2604.12426",
    "description": "Researchers explore how transformers, a type of AI model, adapt their processing depth to tackle tasks of varying complexity. By analyzing how these models process information across different layers and tokens, they shed light on the strategies transformers use to solve problems. The study reveals that transformers can adapt their depth to suit the task at hand, especially when fine-tuned for specific tasks, allowing them to efficiently allocate processing resources.",
    "category": "uses_nnsight",
    "image": "/images/Do-Transformers-Use-their-Depth-Adaptively-Evidence-from-a-Relational-Reasoning-Task.png"
  },
  {
    "title": "DreamReader: An Interpretability Toolkit for Text-to-Image Models",
    "authors": "Nirmalendu Prakash, Narmeen Oozeer, Michael Lan, Luka Samkharadze, Phillip Howard, Roy Ka-Wei Lee, Dhruv Nathawani, Shivam Raval, Amirali Abdullah",
    "venue": "ArXiv 2026",
    "year": 2026,
    "url": "https://arxiv.org/abs/2603.13299",
    "description": "Researchers have developed a comprehensive framework called DreamReader to better understand and interpret text-to-image diffusion models, which have become increasingly popular. By providing a unified and model-agnostic approach, DreamReader enables systematic analysis and intervention across different diffusion architectures, allowing for more transparent and controllable model behavior. This framework also draws inspiration from techniques used in language model interpretability, demonstrating the potential for cross-disciplinary approaches to advance our understanding of complex AI models.",
    "category": "referencing",
    "image": "/images/DreamReader-An-Interpretability-Toolkit-for-Text-to-Image-Models.png"
  },
  {
    "title": "Explaining the Explainer: Understanding the Inner Workings of Transformer-based Symbolic Regression Models",
    "authors": "Arco van Breda, Erman Acar",
    "venue": "ArXiv 2026",
    "year": 2026,
    "url": "https://arxiv.org/abs/2602.03506",
    "description": "Researchers have developed a new algorithm called PATCHES to uncover the internal workings of transformers used in symbolic regression, a task that involves generating mathematical operators. By applying PATCHES, they gained a deeper understanding of how these models operate, shedding light on the underlying mechanisms that drive their performance. This breakthrough paves the way for more interpretable and reliable AI models in this domain.",
    "category": "uses_nnsight",
    "image": "/images/Explaining-the-Explainer-Understanding-the-Inner-Workings-of-Transformer-based-Symbolic-Regression-Models.png"
  },
  {
    "title": "Exploring the Limits of Probes for Latent Representation Edits in GPT Models",
    "authors": "Austin L. Davis, Robinson Vasquez Ferrer, Gita Sukthankar",
    "venue": "AI 2026",
    "year": 2026,
    "url": "https://doi.org/10.3390/ai7030092",
    "description": "Researchers explored the use of probing classifiers to modify the internal workings of a chess-playing AI, aiming to understand if the model has a flexible and editable representation of the game. By testing different techniques, they found that some methods are more effective than others in controlling the AI's behavior, shedding light on the model's internal mechanisms. This study has implications for the development of more transparent and controllable AI systems.",
    "category": "uses_nnsight",
    "image": "/images/Exploring-the-Limits-of-Probes-for-Latent-Representation-Edits-in-GPT-Models.png"
  },
  {
    "title": "Fine-Grained Analysis of Shared Syntactic Mechanisms in Language Models",
    "authors": "Ryoma Kumon, Hitomi Yanaka",
    "venue": "ACL 2026",
    "year": 2026,
    "url": "https://arxiv.org/abs/2604.22166",
    "description": "Researchers are working to understand how language models process complex sentence structures, and whether their internal workings align with principles from linguistics. By analyzing specific components of the models, they found that some sentence structures are handled by shared mechanisms, while others are not, shedding light on how these models think about language. This study's findings have implications for improving the performance of language models on tasks that require nuanced understanding of syntax.",
    "category": "uses_nnsight",
    "image": "/images/Fine-Grained-Analysis-of-Shared-Syntactic-Mechanisms-in-Language-Models.png"
  },
  {
    "title": "Fluid Representations in Reasoning Models",
    "authors": "Dmitrii Kharlapenko, Alessandro Stolfo, Arthur Conmy, Mrinmaya Sachan, Zhijing Jin",
    "venue": "ArXiv 2026",
    "year": 2026,
    "url": "https://arxiv.org/abs/2602.04843",
    "description": "Researchers have made strides in understanding how advanced language models process complex information, shedding light on the internal mechanisms that enable them to excel at abstract problem-solving. By analyzing a model trained to produce detailed reasoning traces, they discovered that it develops more effective representations of actions and concepts over time, focusing on underlying structure rather than specific details. This insight has significant implications for improving the performance of language models in complex problem-solving tasks.",
    "category": "referencing",
    "image": "/images/Fluid-Representations-in-Reasoning-Models.png"
  },
  {
    "title": "Friends and Grandmothers in Silico: Localizing Entity Cells in Language Models",
    "authors": "Itay Yona, Dan Barzilay, Michael Karasik, Mor Geva",
    "venue": "ArXiv 2026",
    "year": 2026,
    "url": "https://arxiv.org/abs/2604.01404",
    "description": "Researchers investigated how language models answer factual questions about specific entities, shedding light on the internal mechanisms at play. By analyzing multiple models and testing their responses to various entity-related prompts, they discovered that certain neurons in the models' early layers play a crucial role in storing and retrieving entity-specific information. This finding has implications for understanding and improving the performance of language models in entity-centric question-answering tasks.",
    "category": "uses_nnsight",
    "image": "/images/Friends-and-Grandmothers-in-Silico-Localizing-Entity-Cells-in-Language-Models.png"
  },
  {
    "title": "From Prompts to Patches: A Vocabulary for Bridging Interpretability and Interaction",
    "authors": "I Grabe, J Josua Benjamin, T Jenkins",
    "venue": "ArXiv 2026",
    "year": 2026,
    "url": "https://dl.acm.org/doi/abs/10.1145/3772363.3798909",
    "description": "Researchers challenge the conventional approach to designing interactions with AI systems, which often treats them as human-like agents that respond to natural language prompts. By exploring alternative perspectives, they aim to broaden our understanding of how humans interact with AI and inform more effective design strategies. This shift in thinking can lead to more intuitive and user-friendly AI interfaces.",
    "category": "referencing",
    "image": "/images/From-Prompts-to-Patches-A-Vocabulary-for-Bridging-Interpretability-and-Interaction.png"
  },
  {
    "title": "Geometric Concept Spaces in Small Encoders: A Comparative Mechanistic Probing of ModernBERT and DeBERTa-v3",
    "authors": "C Leo",
    "venue": "ArXiv 2026",
    "year": 2026,
    "url": "https://papers.ssrn.com/sol3/papers.cfm?abstract_id=6486258",
    "description": "Researchers employed a novel tracing technique to gain insight into the inner workings of complex models, allowing them to extract and analyze the hidden states that drive their behavior. By doing so, they shed light on the previously opaque decision-making processes of these models, which is crucial for understanding and improving their performance. This approach has significant implications for the development of more transparent and reliable AI systems.",
    "category": "uses_nnsight"
  },
  {
    "title": "Hierarchical Latent Structures in Data Generation Process Unify Mechanistic Phenomena across Scale",
    "authors": "Jonas Rohweder, Subhabrata Dutta, Iryna Gurevych",
    "venue": "ArXiv 2026",
    "year": 2026,
    "url": "https://arxiv.org/abs/2603.06592",
    "description": "Researchers have been puzzled by the complex behaviors of large language models, and to better understand these phenomena, they need to study how the models learn from their training data. By generating synthetic data that mimics real-world text corpora, this work provides a controlled environment to investigate the emergence of specific patterns in language models. The findings shed light on the crucial role of hierarchical structures in the data generation process, offering a unified explanation for seemingly unrelated phenomena in large language models.",
    "category": "uses_nnsight",
    "image": "/images/Hierarchical-Latent-Structures-in-Data-Generation-Process-Unify-Mechanistic-Phenomena-across-Scale.png"
  },
  {
    "title": "How Open Must Language Models be to Enable Reliable Scientific Inference?",
    "authors": "James A. Michaelov, Catherine Arnett, Tyler A. Chang, Pamela D. Rivière, Samuel M. Taylor, Cameron R. Jones, Sean Trott, Roger P. Levy, Benjamin K. Bergen, Micah Altman",
    "venue": "ArXiv 2026",
    "year": 2026,
    "url": "https://arxiv.org/abs/2603.26539",
    "description": "Researchers examine how the openness of a model affects the reliability of scientific inferences drawn from it, highlighting the limitations of closed models in scientific research. They argue that these limitations can compromise the validity of findings and propose strategies to address these issues. By promoting transparency and accountability in model selection and deployment, scientists can strengthen the foundations of their research.",
    "category": "referencing",
    "image": "/images/How-Open-Must-Language-Models-be-to-Enable-Reliable-Scientific-Inference.png"
  },
  {
    "title": "Jailbreak Strength and Model Similarity Predict Transferability",
    "authors": "Rico Angell, Jannik Brinkmann, He He",
    "venue": "ICLR 2026",
    "year": 2026,
    "url": "https://openreview.net/forum?id=UQK3tUsouK",
    "description": "Investigates what determines when jailbreaks transfer from one model to another, finding that transfer success depends on both jailbreak strength (effectiveness on the source model) and contextual representation similarity between models. The authors show transferability can be increased by distilling from the target model into the source model using only benign prompts, with the distilled model serving as a more effective surrogate for generating transferable attacks, suggesting jailbreak success reflects fundamental flaws in contextual representations.",
    "category": "uses_nnsight",
    "image": "/images/Jailbreak-Strength-and-Model-Similarity-Predict-Transferability.png"
  },
  {
    "title": "LangFIR: Discovering Sparse Language-Specific Features from Monolingual Data for Language Steering",
    "authors": "Sing Hieng Wong, Hassan Sajjad, A. B. Siddique",
    "venue": "ArXiv 2026",
    "year": 2026,
    "url": "https://arxiv.org/abs/2604.03532",
    "description": "Researchers have developed a new method, LangFIR, to improve control over the language of outputs from large language models. By analyzing how these models respond to random sequences of words, LangFIR can identify the specific features that are unique to a particular language, even with limited data. This breakthrough enables more accurate language control in multilingual models, paving the way for more effective and flexible language generation systems.",
    "category": "uses_nnsight",
    "image": "/images/LangFIR-Discovering-Sparse-Language-Specific-Features-from-Monolingual-Data-for-Language-Steering.png"
  },
  {
    "title": "Learning a Generative Meta-Model of LLM Activations",
    "authors": "Grace Luo, Jiahai Feng, Trevor Darrell, Alec Radford, Jacob Steinhardt",
    "venue": "ArXiv 2026",
    "year": 2026,
    "url": "https://arxiv.org/abs/2602.06964",
    "description": "Researchers are developing new methods to analyze neural network activations, moving away from traditional approaches that rely on strong assumptions about the data's structure. By training generative models on large datasets of neural network internal states, they can uncover patterns and relationships without making these assumptions, leading to more accurate and interpretable results. This approach shows promise for improving the understanding and control of complex neural networks.",
    "category": "uses_nnsight",
    "image": "/images/Learning-a-Generative-Meta-Model-of-LLM-Activations.png"
  },
  {
    "title": "Learning State-Tracking from Code Using Linear RNNs",
    "authors": "Julien Siems, Riccardo Grazzi, Kirill Kalinin, Hitesh Ballani, Babak Rahmani",
    "venue": "ArXiv 2026",
    "year": 2026,
    "url": "https://arxiv.org/abs/2602.14814",
    "description": "Researchers have adapted a challenging task, permutation composition, to better align with the training methods used for language models, allowing for a more accurate assessment of sequence models' capabilities. This adaptation reveals that while some models, like linear RNNs, excel at tracking states in this setting, others, such as Transformers, struggle. The study also explores the inherent difficulties of state tracking in code, highlighting the limitations of even successful models when faced with incomplete information.",
    "category": "uses_nnsight",
    "image": "/images/Learning-State-Tracking-from-Code-Using-Linear-RNNs.png"
  },
  {
    "title": "Mathematical Modeling of Common-Pool Resources: A Comprehensive Review of Bioeconomics, Strategic Interaction, and Complex Adaptive Systems",
    "authors": "Zebiao Li, Rui Liu, Chengyi Tu",
    "venue": "ArXiv 2026",
    "year": 2026,
    "url": "https://arxiv.org/abs/2602.03129",
    "description": "Researchers tackle the complex challenge of managing shared resources, where individual actions can significantly impact the overall system, and exclusion of users is difficult. By examining the governance of these common-pool resources, this study sheds light on the intricate relationships between users, resources, and institutions. Its findings offer valuable insights for developing effective management strategies that balance individual needs with the long-term sustainability of these shared resources.",
    "category": "referencing",
    "image": "/images/Mathematical-Modeling-of-Common-Pool-Resources-A-Comprehensive-Review-of-Bioeconomics-Strategic-Interaction-and-Complex-Adaptive-Systems.png"
  },
  {
    "title": "Model Medicine: A Clinical Framework for Understanding, Diagnosing, and Treating AI Models",
    "authors": "Jihoon Jeong",
    "venue": "ArXiv 2026",
    "year": 2026,
    "url": "https://arxiv.org/abs/2603.04722",
    "description": "Researchers are developing a new field called Model Medicine, which views AI models as complex systems that can be understood, diagnosed, and treated like living organisms. By bridging the gap between AI interpretability research and clinical practice, Model Medicine aims to provide a systematic approach to understanding and improving complex AI systems. This work lays the groundwork for a comprehensive framework for diagnosing and treating AI models, with potential applications in developing more reliable and trustworthy AI systems.",
    "category": "referencing",
    "image": "/images/Model-Medicine-A-Clinical-Framework-for-Understanding-Diagnosing-and-Treating-AI-Models.png"
  },
  {
    "title": "Patches of Nonlinearity: Instruction Vectors in Large Language Models",
    "authors": "Irina Bigoulaeva, Jonas Rohweder, Subhabrata Dutta, Iryna Gurevych",
    "venue": "ArXiv 2026",
    "year": 2026,
    "url": "https://arxiv.org/abs/2602.07930",
    "description": "Researchers are shedding light on how language models process instructions internally, a crucial aspect that has been largely overlooked despite their widespread use. By analyzing the inner workings of these models, they've discovered that instruction representations are surprisingly localized and exhibit both linear and non-linear properties, challenging common assumptions about how these models work. This new understanding could lead to more effective methods for interpreting and improving language models.",
    "category": "uses_nnsight",
    "image": "/images/Patches-of-Nonlinearity-Instruction-Vectors-in-Large-Language-Models.png"
  },
  {
    "title": "PyHealth 2.0: A Comprehensive Open-Source Toolkit for Accessible and Reproducible Clinical Deep Learning",
    "authors": "John Wu, Yongda Fan, Zhenbang Wu, Paul Landes, Eric Schrock, Sayeed Sajjad Razin, Arjun Chatterjee, Naveen Baskaran, Joshua Steier, Andrea Fitzpatrick, Bilal Arif, Rian Atri, Jathurshan Pradeepkumar, Siddhartha Laghuvarapu, Junyi Gao, Adam R. Cross, Jimeng Sun",
    "venue": "ArXiv 2026",
    "year": 2026,
    "url": "https://arxiv.org/abs/2601.16414",
    "description": "Researchers have developed PyHealth 2.0, a clinical deep learning toolkit that tackles common barriers to clinical AI research, such as difficulty replicating results and high computational costs. By unifying various datasets, models, and interpretability methods within a single framework, PyHealth 2.0 enables faster and more accessible predictive modeling in healthcare. This open-source toolkit aims to advance reproducible and accessible healthcare AI, making it easier for researchers and clinicians to collaborate and drive innovation.",
    "category": "referencing",
    "image": "/images/PyHealth-20-A-Comprehensive-Open-Source-Toolkit-for-Accessible-and-Reproducible-Clinical-Deep-Learning.png"
  },
  {
    "title": "reward-lens: A Mechanistic Interpretability Library for Reward Models",
    "authors": "Mohammed Suhail B Nadaf",
    "venue": "ArXiv 2026",
    "year": 2026,
    "url": "https://arxiv.org/abs/2604.26130",
    "description": "Researchers are working to better understand how reward models influence the behavior of language models trained with reinforcement learning from human feedback (RLHF). By developing new tools and techniques, they aim to shed light on the complex relationships between these models and their training data. This increased transparency is crucial for building more trustworthy and reliable language models.",
    "category": "referencing",
    "image": "/images/reward-lens-A-Mechanistic-Interpretability-Library-for-Reward-Models.png"
  },
  {
    "title": "SymTorch: A Framework for Symbolic Distillation of Deep Neural Networks",
    "authors": "Elizabeth S. Z. Tan, Adil Soubki, Miles Cranmer",
    "venue": "ArXiv 2026",
    "year": 2026,
    "url": "https://arxiv.org/abs/2602.21307",
    "description": "Researchers are exploring a new approach called symbolic distillation, which replaces complex neural networks with simpler, more interpretable mathematical expressions. By doing so, they aim to uncover underlying patterns and relationships in data, making AI systems more transparent and understandable. This method has the potential to reveal new insights and improve the reliability of machine learning models.",
    "category": "referencing",
    "image": "/images/SymTorch-A-Framework-for-Symbolic-Distillation-of-Deep-Neural-Networks.png"
  },
  {
    "title": "The Truthfulness Spectrum Hypothesis",
    "authors": "Zhuofan Josh Ying, Shauli Ravfogel, Nikolaus Kriegeskorte, Peter Hase",
    "venue": "ArXiv 2026",
    "year": 2026,
    "url": "https://arxiv.org/abs/2602.20273",
    "description": "Researchers challenge the idea that large language models encode truthfulness in a straightforward way, instead proposing that truth is represented on a spectrum from general to specific. By testing language models on various types of truth and deception, they find that while models can generalize well across many domains, they struggle with certain types of lying. The study sheds light on how language models represent truth and how this representation can be influenced by training and fine-tuning.",
    "category": "uses_nnsight",
    "image": "/images/The-Truthfulness-Spectrum-Hypothesis.png"
  },
  {
    "title": "Transformer See, Transformer Do: Copying as an Intermediate Step in Learning Analogical Reasoning",
    "authors": "Philipp Hellwig, Willem Zuidema, Claire E. Stevenson, Martha Lewis",
    "venue": "ArXiv 2026",
    "year": 2026,
    "url": "https://arxiv.org/abs/2604.06501",
    "description": "Researchers tackle the challenge of developing artificial intelligence systems that can reason analogically, a key aspect of human intelligence. By training transformers with a novel approach and guiding them to focus on the most relevant information, they demonstrate improved performance on analogical reasoning tasks. This work sheds light on the potential and limitations of AI systems in mimicking human-like reasoning and has implications for understanding both machine and human intelligence.",
    "category": "uses_nnsight",
    "image": "/images/Transformer-See-Transformer-Do-Copying-as-an-Intermediate-Step-in-Learning-Analogical-Reasoning.png"
  },
  {
    "title": "Triggers Hijack Language Circuits: A Mechanistic Analysis of Backdoor Behaviors in Large Language Models",
    "authors": "Théo Lasnier, Wissam Antoun, Francis Kulumba, Djamé Seddah",
    "venue": "ArXiv 2026",
    "year": 2026,
    "url": "https://arxiv.org/abs/2602.10382",
    "description": "Researchers have made a breakthrough in understanding how backdoor attacks work in Large Language Models, shedding light on the internal mechanisms that allow triggers to manipulate output language. By analyzing a specific model family, they discovered that triggers don't create isolated circuits, but instead hijack the model's existing language components, which has significant implications for developing effective defense strategies. This new understanding could lead to more targeted detection and mitigation methods that focus on monitoring and leveraging the model's natural language processing abilities.",
    "category": "uses_nnsight",
    "image": "/images/Triggers-Hijack-Language-Circuits-A-Mechanistic-Analysis-of-Backdoor-Behaviors-in-Large-Language-Models.png"
  },
  {
    "title": "When Meanings Meet: Investigating the Emergence and Quality of Shared Concept Spaces during Multilingual Language Model Training",
    "authors": "Felicia Körner, Max Müller-Eberstein, Anna Korhonen, Barbara Plank",
    "venue": "EACL 2026",
    "year": 2026,
    "url": "https://arxiv.org/abs/2601.22851",
    "description": "Unfortunately, you haven't provided the abstract text. Please provide the abstract text, and I'll summarize it in 1-3 sentences, focusing on what the paper does and why it matters, in an accessible way for a technical but non-specialist audience.",
    "category": "uses_nnsight",
    "image": "/images/When-Meanings-Meet-Investigating-the-Emergence-and-Quality-of-Shared-Concept-Spaces-during-Multilingual-Language-Model-Training.png"
  },
  {
    "title": "A survey on mechanistic interpretability for multi-modal foundation models",
    "authors": "Zihao Lin, Samyadeep Basu, Mohammad Beigi, Varun Manjunatha, Ryan A. Rossi, Zichao Wang, Yufan Zhou, Sriram Balasubramanian, Arman Zarei, Keivan Rezaei, Ying Shen, Barry Menglong Yao, Zhiyang Xu, Qin Liu, Yuxiang Zhang, Yan Sun, Shilong Liu, Li Shen, Hongxuan Li, Soheil Feizi, Lifu Huang",
    "venue": "ArXiv 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2502.17516",
    "description": "Researchers are working to demystify the inner workings of foundation models, a class of powerful machine learning tools that have revolutionized the field, in order to harness their potential and develop more efficient and reliable applications. By shedding light on how these models operate, scientists can unlock new possibilities for their use and improve their overall performance. This effort aims to bridge the gap between the models' impressive capabilities and their limited interpretability.",
    "category": "referencing",
    "image": "/images/A-survey-on-mechanistic-interpretability-for-multi-modal-foundation-models.png"
  },
  {
    "title": "Activation space interventions can be transferred between large language models",
    "authors": "Narmeen Oozeer, Dhruv Nathawani, Nirmalendu Prakash, Michael Lan, Abir Harrasse, Amirali Abdullah",
    "venue": "ICML 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2503.04429",
    "description": "Researchers are exploring the concept of representation universality in AI models, where different models and architectures are converging on similar representations across various domains and modalities. This convergence has significant implications for the development of more generalizable and adaptable AI systems. By understanding these universal representations, researchers can unlock new possibilities for AI applications.",
    "category": "referencing",
    "image": "/images/Activation-space-interventions-can-be-transferred-between-large-language-models.png"
  },
  {
    "title": "Annotating the Chain-of-Thought: A Behavior-Labeled Dataset for AI Safety",
    "authors": "Antonio-Gabriel Chacón Menke, Phan Xuan Tan, Eiji Kamioka",
    "venue": "ArXiv 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2510.18154",
    "description": "Presents sentence-level labeled dataset enabling activation-based monitoring of safety behaviors during LLM chain-of-thought reasoning. Contains reasoning sequences with annotations of safety behaviors (safety concerns expression, user intent speculation) used to extract steering vectors for detecting and influencing these behaviors in model activations. Fills gap by identifying precisely when specific behaviors occur within reasoning chains rather than holistic labeling, demonstrating utility through representations that both detect and steer safety behaviors, showcasing activation-level techniques' potential for improving safety oversight.",
    "category": "uses_nnsight",
    "image": "/images/Annotating-the-Chain-of-Thought-A-Behavior-Labeled-Dataset-for-AI-Safety.png"
  },
  {
    "title": "Back Attention: Understanding and Enhancing Multi-Hop Reasoning in Large Language Models",
    "authors": "Zeping Yu, Yonatan Belinkov, Sophia Ananiadou",
    "venue": "EMNLP 2025",
    "year": 2025,
    "url": "https://doi.org/10.18653/v1/2025.emnlp-main.567",
    "description": "Investigates how large language models perform multi-hop reasoning by introducing logit flow, a method to trace logit propagation during prediction, revealing four distinct stages of knowledge retrieval. To improve multi-hop reasoning, the authors propose back attention, allowing lower layers to use higher-layer hidden states, significantly boosting prediction accuracy across multiple models and datasets.",
    "category": "uses_nnsight",
    "image": "/images/Back-Attention-Understanding-and-Enhancing-Multi-Hop-Reasoning-in-Large-Language-Models.png"
  },
  {
    "title": "BlueGlass: A Framework for Composite AI Safety",
    "authors": "Harshal Nandigramwar, Syed Qutub, Kay-Ulrich Scholl",
    "venue": "ICML 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2507.10106",
    "description": "Researchers are working to address a critical gap in AI safety by developing more comprehensive tools to ensure the reliability of these increasingly powerful systems. Current approaches often focus on specific aspects of model safety, but a more holistic approach is needed to mitigate potential risks. By bridging this gap, the goal is to create safer and more trustworthy AI systems.",
    "category": "referencing",
    "image": "/images/BlueGlass-A-Framework-for-Composite-AI-Safety.png"
  },
  {
    "title": "Can SAEs reveal and mitigate racial biases of LLMs in healthcare?",
    "authors": "Hiba Ahsan, Byron C. Wallace",
    "venue": "ICLR 2026",
    "year": 2025,
    "url": "https://arxiv.org/abs/2511.00177",
    "description": "Researchers are exploring ways to detect and mitigate biases in large language models (LLMs) used in healthcare, particularly those related to patient demographics. By analyzing the inner workings of these models using Sparse Autoencoders, they aim to identify and control associations between patient characteristics, such as race, and stigmatizing concepts. This work highlights the potential of this approach to reveal problematic biases, but also its limitations in effectively mitigating them in real-world clinical tasks.",
    "category": "uses_nnsight",
    "image": "/images/Can-SAEs-reveal-and-mitigate-racial-biases-of-LLMs-in-healthcare.png"
  },
  {
    "title": "Circuit-Tracer: A New Library for Finding Feature Circuits",
    "authors": "Michael P. Hanna, Mateusz Piotrowski, Jack W Lindsey, Emmanuel Ameisen",
    "venue": "BlackboxNLP 2025",
    "year": 2025,
    "url": "https://www.semanticscholar.org/paper/9bfb11d77d42427ab289c068739dcee23917dc6a",
    "description": "",
    "category": "uses_nnsight",
    "image": "/images/Circuit-Tracer-A-New-Library-for-Finding-Feature-Circuits.png"
  },
  {
    "title": "Compassionate AI Design, Governance, and Use",
    "authors": "Raffaele Fabio Ciriello; Angelina Ying Chen; Zara Annette Rubinsztein",
    "venue": "IEEE Transactions on Technology and Society 2025",
    "year": 2025,
    "url": "https://www.semanticscholar.org/paper/beada9c7951c0c9a93d179667c8bd8cfb5e37f34",
    "description": "As generative AI increasingly simulates human-like empathy, it raises concerns about its potential to exploit emotional vulnerabilities and erode genuine human connections. A new framework for compassionate AI design and governance aims to mitigate these risks by prioritizing empathy, dignity, and human flourishing. By emphasizing equitable distribution of AI's benefits and burdens, this approach seeks to promote responsible AI development that supports human well-being.",
    "category": "referencing",
    "image": "/images/Compassionate-AI-Design-Governance-and-Use.png"
  },
  {
    "title": "Constructive Circuit Amplification: Improving Math Reasoning in LLMs via Targeted Sub-Network Updates",
    "authors": "Nikhil Prakash, Donghao Ren, Dominik Moritz, Yannick Assogba",
    "venue": "ICLR 2026 Trustworthy AI",
    "year": 2025,
    "url": "https://arxiv.org/abs/2512.16914",
    "description": "Researchers have discovered that large language models (LLMs) contain sparse subnetworks, or \"circuits,\" that perform specific tasks, and that fine-tuning these models often strengthens these existing circuits. Building on this understanding, a new method called Constructive Circuit Amplification has been developed to make targeted updates to these circuits, allowing for precise improvements in model performance. By selectively updating a small set of model components, this approach can enhance specific capabilities without compromising other abilities.",
    "category": "uses_nnsight",
    "image": "/images/Constructive-Circuit-Amplification-Improving-Math-Reasoning-in-LLMs-via-Targeted-Sub-Network-Updates.png"
  },
  {
    "title": "Decomposing Theory of Mind: How Emotional Processing Mediates ToM Abilities in LLMs",
    "authors": "Ivan Chulo, Ananya Joshi",
    "venue": "ArXiv 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2511.15895",
    "description": "Investigates mechanisms driving Theory of Mind improvements in LLMs through Contrastive Activation Addition steering. By comparing steered versus baseline Gemma-3-4B activations using linear probes trained on 45 cognitive actions across 1,000 BigToM scenarios, finds improved belief attribution (32.5% to 46.7% accuracy) is mediated by increased emotional content processing (emotion perception +2.23, emotion valuing +2.20) while suppressing analytical processes (questioning -0.78, convergent thinking -1.59), suggesting ToM relies on emotional understanding rather than analytical reasoning.",
    "category": "uses_nnsight",
    "image": "/images/Decomposing-Theory-of-Mind-How-Emotional-Processing-Mediates-ToM-Abilities-in-LLMs.png"
  },
  {
    "title": "Deep Learning as a Scientific Tool and a Model Organism of Intelligence",
    "authors": "Core Francisco Park",
    "venue": "Dissertation",
    "year": 2025,
    "url": "https://search.proquest.com/openview/42d39d65d5765468ffd060e434c247a2/1?pq-origsite=gscholar&cbl=18750&diss=y",
    "description": "Dissertation investigating deep learning both as tool for scientific discovery and as model organism for studying intelligence. Covers: (1) applications accelerating experimental neuroscience through neuron tracking and electron microscope guidance, (2) robustifying scientific analysis via error correction and uncertainty quantification, (3) examining emergence of intelligent abilities including compositional learning and in-context learning phase diagrams, (4) analyzing task-specific representations and information integration in LLMs, concluding with fundamental cognitive abilities needing improvement for general intelligence.",
    "category": "uses_ndif"
  },
  {
    "title": "DeltaProduct: Improving State-Tracking in Linear RNNs via Householder Products",
    "authors": "Julien Siems, Timur Carstensen, Arber Zela, Frank Hutter, Massimiliano Pontil, Riccardo Grazzi",
    "venue": "NeurIPS 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2502.10297",
    "description": "This paper presents DeltaProduct, a linear RNN that improves expressivity while maintaining efficiency by using a diagonal plus rank-nh transition matrix built from Householder transforms. It outperforms DeltaNet in state tracking, language modeling, and length extrapolation, and the authors also provides new theoretical insights, showing that DeltaNet can solve dihedral group word problems in just two layers.",
    "category": "uses_nnsight",
    "image": "/images/DeltaProduct-Improving-State-Tracking-in-Linear-RNNs-via-Householder-Products.png"
  },
  {
    "title": "Discovering Forbidden Topics in Language Models",
    "authors": "Can Rager, Chris Wendler, Rohit Gandikota, David Bau",
    "venue": "ArXiv 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2505.17441",
    "description": "Researchers have introduced a new problem in AI: identifying the topics that language models refuse to discuss, and developed a method to uncover these \"forbidden\" topics. This method, called Iterated Prefill Crawler, uses a technique called token prefilling to find the topics that a model is programmed to avoid. By applying this method to several widely used language models, the study reveals patterns of bias and censorship, highlighting the need for tools to detect and understand the boundaries and limitations of AI systems.",
    "category": "referencing",
    "image": "/images/Discovering-Forbidden-Topics-in-Language-Models.png"
  },
  {
    "title": "Disentangling Recall and Reasoning in Transformer Models through Layer-wise Attention and Activation Analysis",
    "authors": "Harshwardhan Fartale, Ashish Kattamuri, Rahul Raja, Arpita Vats, Ishita Prasad, Akshata Kishore Moharir",
    "venue": "ArXiv 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2510.03366",
    "description": "Investigates whether recall (retrieving memorized facts) and reasoning (multi-step inference) rely on distinct internal mechanisms in transformers. Using controlled synthetic linguistic puzzles and activation patching with structured ablations on Qwen and LLaMA models, provides first causal evidence that recall and reasoning depend on separable but interacting circuits. Disabling identified \"recall circuits\" reduces fact-retrieval accuracy by up to 15% while leaving reasoning intact, whereas disabling \"reasoning circuits\" reduces multi-step inference comparably. Neuron-level analysis shows task-specific firing patterns, though less robust, consistent with neuronal polysemanticity, advancing understanding of functional specialization in transformer architectures.",
    "category": "uses_nnsight",
    "image": "/images/Disentangling-Recall-and-Reasoning-in-Transformer-Models-through-Layer-wise-Attention-and-Activation-Analysis.png"
  },
  {
    "title": "Do Language Models Use Their Depth Efficiently?",
    "authors": "Róbert Csordás, Christopher D. Manning, Christopher Potts",
    "venue": "NeurIPS 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2505.13898",
    "description": "Analyzes whether deep LLMs use depth to compose features or merely spread computation across layers. Examining Llama 3.1 and Qwen 3 families' residual streams, finds: (1) second-half layers contribute much less than first-half with clear phase transition, (2) skipping second-half layers has minimal effect, (3) no evidence of composition in multihop tasks, (4) linear maps from shallow to deep models show same relative depths map best, suggesting larger models spread same computations over layers rather than learning new computation types, explaining diminishing returns from increased depth.",
    "category": "uses_ndif",
    "image": "/images/Do-Language-Models-Use-Their-Depth-Efficiently.png"
  },
  {
    "title": "Do Natural Language Descriptions of Model Activations Convey Privileged Information?",
    "authors": "Millicent Li, Alberto Mario Ceballos Arroyo, Giordano Rogers, Naomi Saphra, Byron C. Wallace",
    "venue": "ArXiv 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2509.13316",
    "description": "Critically evaluates activation verbalization methods that translate LLM internal representations into natural language descriptions. Findings show these methods can succeed at benchmarks without accessing target model internals, and verbalizations often reflect the verbalizer LLM's parametric knowledge rather than the target model's. Controlled experiments indicate a need for targeted benchmarks and experimental controls to assess whether verbalization methods provide meaningful insights into LLM operations.",
    "category": "uses_nnsight",
    "image": "/images/Do-Natural-Language-Descriptions-of-Model-Activations-Convey-Privileged-Information.png"
  },
  {
    "title": "eDIF: A European Deep Inference Fabric for Remote Interpretability of LLM",
    "authors": "Irma Heithoff. Marc Guggenberger, Sandra Kalogiannis, Susanne Mayer, Fabian Maag, Sigurd Schacht, Carsten Lanquillon",
    "venue": "ArXiv 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2508.10553",
    "description": "Presents feasibility study for European Deep Inference Fabric (eDIF), NDIF-compatible infrastructure supporting mechanistic interpretability research. Describes GPU cluster at Ansbach University interconnected with partner institutions enabling remote model inspection via NNsight API. Pilot study with 16 European researchers evaluated platform through activation patching, causal tracing, and representation analysis on GPT-2 and DeepSeek-R1-70B, revealing gradual engagement increase, stable performance, and positive reception, while identifying limitations (download durations, execution interruptions) for future development.",
    "category": "uses_ndif",
    "image": "/images/eDIF-A-European-Deep-Inference-Fabric-for-Remote-Interpretability-of-LLM.png"
  },
  {
    "title": "Elucidating Mechanisms of Demographic Bias in LLMs for Healthcare",
    "authors": "Hiba Ahsan, Arnab Sen Sharma, Silvio Amir, David Bau, Byron Wallace",
    "venue": "Findings of EMNLP 2025",
    "year": 2025,
    "url": "https://doi.org/10.18653/v1/2025.findings-emnlp.789",
    "description": "Uncovers how large language models encode sociodemographic information, such as gender and race, within the context of healthcare. The authors find that gender information is concentrated in middle MLP layers and can be manipulated at inference time through patching, impacting clinical vignette generation and predictions related to gender (e.g., depression risk). While race information is more distributed, it can also be influenced. This work represents the first application of interpretability methods to study sociodemographic biases in LLMs for healthcare.",
    "category": "uses_nnsight",
    "image": "/images/Elucidating-Mechanisms-of-Demographic-Bias-in-LLMs-for-Healthcare.png"
  },
  {
    "title": "Emergence of Hierarchical Emotion Organization in Large Language Models",
    "authors": "Bo Zhao, Maya Okawa, Eric J. Bigelow, Rose Yu, Tomer Ullman, Ekdeep Singh Lubana, Hidenori Tanaka",
    "venue": "NeurIPS Sci4DL Workshop",
    "year": 2025,
    "url": "https://arxiv.org/abs/2507.10599",
    "description": "Explores how large language models (LLMs) represent and predict human emotions, revealing that larger models develop more complex hierarchical emotion structures and achieve better outcomes in negotiation tasks by accurately modeling counterparts' emotions. Also highlights ethical concerns, showing that LLMs exhibit persona biases, often misclassifying emotions for minority personas, raising important considerations for responsible deployment.",
    "category": "uses_ndif",
    "image": "/images/Emergence-of-Hierarchical-Emotion-Organization-in-Large-Language-Models.png"
  },
  {
    "title": "Even Heads Fix Odd Errors: Mechanistic Discovery and Surgical Repair in Transformer Attention",
    "authors": "Gustavo Sandoval",
    "venue": "ArXiv 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2508.19414",
    "description": "Investigates error correction mechanisms in Transformer attention through mechanistic discovery and surgical repair techniques. The work identifies specific attention heads responsible for fixing particular types of errors, demonstrating that targeted interventions can restore correct behavior. The findings provide insights into how attention mechanisms detect and correct mistakes, with implications for understanding and improving Transformer reliability.",
    "category": "uses_nnsight",
    "image": "/images/Even-Heads-Fix-Odd-Errors-Mechanistic-Discovery-and-Surgical-Repair-in-Transformer-Attention.png"
  },
  {
    "title": "Explaining Neural Networks with Reasons",
    "authors": "Levin Hornischer, Hannes Leitgeb",
    "venue": "ArXiv 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2505.14424",
    "description": "Proposes a novel interpretability method based on a mathematico-philosophical theory of reasons, computing \"reasons vectors\" for each neuron that indicate how strongly they support various propositions. The method combines logical and Bayesian perspectives, accounts for polysemanticity, and is shown to be grounded in established notions of explanation, uniform across architectures, scalable through forward passes, faithful under intervention, and trainable to improve reason strengths, with applications to robustness and fairness.",
    "category": "uses_nnsight",
    "image": "/images/Explaining-Neural-Networks-with-Reasons.png"
  },
  {
    "title": "From Directions to Cones: Exploring Multidimensional Representations of Propositional Facts in LLMs",
    "authors": "Stanley Yu, Vaidehi Bulusu, Oscar Yasunaga, Clayton Lau, Cole Blondin, Sean O'Brien, Kevin Zhu, Vasu Sharma",
    "venue": "ACL Student Research Workshop",
    "year": 2025,
    "url": "https://arxiv.org/abs/2505.21800",
    "description": "Extends the concept cone framework from refusal modeling to truthfulness in LLMs, identifying multidimensional cones whose directions reliably steer model responses to factual statements. Evidence comes from causal interventions that flip responses, generalization across model architectures, and preservation of unrelated behaviors. The findings reveal richer, multidirectional geometric structures governing truth/falsehood in LLMs beyond single linear directions, with concept cones offering a promising tool for probing abstract behaviors.",
    "category": "uses_nnsight",
    "image": "/images/From-Directions-to-Cones-Exploring-Multidimensional-Representations-of-Propositional-Facts-in-LLMs.png"
  },
  {
    "title": "How do llms persuade? linear probes can uncover persuasion dynamics in multi-turn conversations",
    "authors": "Brandon Jaipersaud, David Krueger, Ekdeep Singh Lubana",
    "venue": "ArXiv 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2508.05625",
    "description": "Uses linear probes to study persuasion dynamics in natural multi-turn conversations, training probes on persuasion success, persuadee personality, and persuasion strategy. Despite their simplicity, probes capture various persuasion aspects at sample and dataset levels, identifying when persuasion occurs and performing comparably to or better than expensive prompting approaches. The work suggests probes as efficient tools for studying complex behaviors like deception and manipulation, especially in multi-turn settings and large-scale analysis.",
    "category": "uses_nnsight",
    "image": "/images/How-do-llms-persuade-linear-probes-can-uncover-persuasion-dynamics-in-multi-turn-conversations.png"
  },
  {
    "title": "If open source is to win, it must go public",
    "authors": "Joshua Tan, Nicholas Vincent, Katherine Elkins, Magnus Sahlgren",
    "venue": "ArXiv 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2507.09296",
    "description": "Democratizing access to AI requires more than just open source models, as significant resources are needed to activate and maintain them. To make AI truly accessible and beneficial to all, a complementary approach is needed, focusing on public AI infrastructure and institutions that prioritize public interest. By building public infrastructure to support open source AI, we can unlock its full potential as a public good.",
    "category": "referencing",
    "image": "/images/If-open-source-is-to-win-it-must-go-public.png"
  },
  {
    "title": "In Which Areas of Technical AI Safety Could Geopolitical Rivals Cooperate?",
    "authors": "Ben Bucknall, Saad Siddiqui, Lara Thurnherr, Conor McGurk, Ben Harack, Anka Reuel, Patricia Paskov, Casey Mahoney, Sören Mindermann, Scott Singer, Vinay Hiremath, Charbel-Raphaël Segerie, Oscar Delaney, Alessandro Abate, Fazl Barez, Michael K. Cohen, Philip Torr, Ferenc Huszár, Anisoara Calinescu, Gabriel Davis Jones, Yoshua Bengio, Robert Trager",
    "venue": "FAccT 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2504.12914",
    "description": "International cooperation on AI safety research is crucial for addressing global risks, but it also raises concerns about national security. By examining the technical factors that impact these risks, researchers can identify areas where cooperation is beneficial and develop strategies to mitigate potential harm. This analysis aims to inform policymakers and researchers on how to navigate the complexities of international cooperation on AI safety, ultimately enabling them to harness its benefits while minimizing risks.",
    "category": "referencing",
    "image": "/images/In-Which-Areas-of-Technical-AI-Safety-Could-Geopolitical-Rivals-Cooperate.png"
  },
  {
    "title": "In-Context Algebra",
    "authors": "Eric Todd, Jannik Brinkmann, Rohit Gandikota, David Bau",
    "venue": "ICLR 2026",
    "year": 2025,
    "url": "https://arxiv.org/abs/2512.16902",
    "description": "Researchers explore how transformers learn to solve arithmetic problems when variables' meanings are defined by their interactions within a sequence. By designing a challenging task where token assignments vary, they uncover the mechanisms that enable transformers to achieve high accuracy and generalize to new situations. This study reveals that transformers can develop symbolic reasoning strategies, such as recognizing patterns and tracking relationships, when trained to reason about variables in context.",
    "category": "uses_nnsight",
    "image": "/images/In-Context-Algebra.png"
  },
  {
    "title": "In-Context Learning Without Copying",
    "authors": "Kerem Sahin, Sheridan Feucht, Adam Belfki, Jannik Brinkmann, Aaron Mueller, David Bau, Chris Wendler",
    "venue": "ArXiv 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2511.05743",
    "description": "Questions whether induction heads (performing inductive copying by matching and copying patterns) are prerequisites for in-context learning (ICL) by introducing Hapax—a training setting omitting loss contribution from tokens predictable by induction heads. Despite 31.7% token omission and significant inductive copying reduction, abstractive ICL performance remains comparable, surpassing vanilla models on 13 of 21 tasks. Mechanistic analysis shows Hapax-trained models develop fewer and weaker induction heads but preserve ICL capabilities, indicating inductive copying is not essential for learning abstractive ICL mechanisms.",
    "category": "uses_nnsight",
    "image": "/images/In-Context-Learning-Without-Copying.png"
  },
  {
    "title": "Incremental Sentence Processing Mechanisms in Autoregressive Transformer Language Models",
    "authors": "Michael Hanna, Aaron Mueller",
    "venue": "NAACL 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2412.05353",
    "description": "Investigates how autoregressive language models (LMs) process garden path sentences, revealing that LMs use both syntactic features and shallow heuristics to interpret ambiguous sentences. Using sparse autoencoders, the authors show that LMs simultaneously represent multiple interpretations but do not effectively reanalyze their initial understanding when answering follow-up questions.",
    "category": "uses_nnsight",
    "image": "/images/Incremental-Sentence-Processing-Mechanisms-in-Autoregressive-Transformer-Language-Models.png"
  },
  {
    "title": "Inference-Time Decomposition of Activations (ITDA): A Scalable Approach to Interpreting Large Language Models",
    "authors": "Patrick Leask, Neel Nanda, Noura Al Moubayed",
    "venue": "ICML 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2505.17769",
    "description": "Introduces ITDAs (Inference-Time Decomposition of Activation models), constructed by greedily sampling activations into dictionaries based on matching pursuit reconstruction error thresholds. Trainable in 1% of SAE time, enabling cheap training on Llama-3.1 70B and 405B. ITDA dictionaries enable cross-model comparisons, outperforming existing methods (CKA, SVCCA, relative representation methods) on representation similarity benchmarks, offering scalable alternative to SAEs for mechanistic interpretability.",
    "category": "uses_ndif",
    "image": "/images/Inference-Time-Decomposition-of-Activations-ITDA-A-Scalable-Approach-to-Interpreting-Large-Language-Models.png"
  },
  {
    "title": "Insights into a radiology-specialised multimodal large language model with sparse autoencoders",
    "authors": "Kenza Bouzid, Shruthi Bannur, Felix Meissen, Daniel Coelho de Castro, Anton Schwaighofer, Javier Alvarez-Valle, Stephanie L. Hyland",
    "venue": "ICML Actionable Interpretability Workshop",
    "year": 2025,
    "url": "https://arxiv.org/abs/2507.12950",
    "description": "Applies Matryoshka-SAE to interpret MAIRA-2, a radiology-specialized multimodal LLM, using large-scale automated interpretability to identify clinically relevant concepts including medical devices, pathologies, longitudinal changes, and textual features. The authors examine feature influence through steering experiments with mixed success, revealing practical and methodological challenges while offering initial insights into learned concepts. The work releases trained SAEs and interpretations, marking a step toward mechanistic understanding of domain-adapted multimodal LLMs.",
    "category": "uses_nnsight",
    "image": "/images/Insights-into-a-radiology-specialised-multimodal-large-language-model-with-sparse-autoencoders.png"
  },
  {
    "title": "Interpreto: An Explainability Library for Transformers",
    "authors": "Antonin Poché, Thomas Mullor, Gabriele Sarti, Frédéric Boisnard, Corentin Friedrich, Charlotte Claye, François Hoofd, Raphael Bernas, Céline Hudelot, Fanny Jourdan",
    "venue": "ArXiv 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2512.09730",
    "description": "Researchers have developed Interpreto, an open-source library that helps make sense of complex language models by providing tools to understand how they work. By offering two types of explanation methods, Interpreto bridges the gap between research and practical applications, making it easier to interpret and understand the decisions made by these models. This library is particularly notable for its ability to provide more nuanced explanations that go beyond simple feature attributions.",
    "category": "uses_nnsight",
    "image": "/images/Interpreto-An-Explainability-Library-for-Transformers.png"
  },
  {
    "title": "Jailbreak transferability emerges from shared representations",
    "authors": "Rico Angell, Jannik Brinkmann, He He",
    "venue": "ICLR 2026",
    "year": 2025,
    "url": "https://arxiv.org/abs/2506.12913",
    "description": "Investigates what determines when jailbreaks transfer from one model to another, finding that transfer success depends on both jailbreak strength (effectiveness on the source model) and contextual representation similarity between models. The authors show transferability can be increased by distilling from the target model into the source model using only benign prompts, with the distilled model serving as a more effective surrogate for generating transferable attacks, suggesting jailbreak success reflects fundamental flaws in contextual representations.",
    "category": "uses_nnsight",
    "image": "/images/Jailbreak-transferability-emerges-from-shared-representations.png"
  },
  {
    "title": "Language Models use Lookbacks to Track Beliefs",
    "authors": "Nikhil Prakash, Natalie Shapira, Arnab Sen Sharma, Christoph Riedl, Yonatan Belinkov, Tamar Rott Shaham, David Bau, Atticus Geiger",
    "venue": "NeurIPS 2025 Mech Interp Workshop",
    "year": 2025,
    "url": "https://arxiv.org/abs/2505.14685",
    "description": "Analyzes Llama-3-70B-Instruct's belief reasoning using causal mediation and abstraction, discovering pervasive \"lookback mechanism\" enabling information recall when necessary. The model binds character-object-state triples by co-locating reference information as Ordering IDs in low-rank subspaces. When queried about beliefs, binding lookback retrieves corresponding state OI, then answer lookback retrieves state token. With visibility specifications, model generates visibility ID encoding character relations, used in visibility lookback to retrieve information and update beliefs, providing insights into LLM Theory of Mind mechanisms.",
    "category": "uses_ndif",
    "image": "/images/Language-Models-use-Lookbacks-to-Track-Beliefs.png"
  },
  {
    "title": "Language Models Use Trigonometry to Do Addition",
    "authors": "Subhash Kantamneni, Max Tegmark",
    "venue": "ICLR 2025 Workshop on Building Trust in Language Models and Applications",
    "year": 2025,
    "url": "https://arxiv.org/abs/2502.00873",
    "description": "Investigates how mid-sized large language models (LLMs) perform mathematical tasks. The authors discover that LLMs represent numbers as a generalized helix which is causally involved in tasks like addition, subtraction, and other arithmetic operations. They propose that LLMs add by manipulating these number-representing helices using the \"Clock\" algorithm. Through causal interventions and analysis of MLP outputs and attention heads, the authors provide the first representation-level explanation of how LLMs perform mathematical operations.",
    "category": "uses_nnsight",
    "image": "/images/Language-Models-Use-Trigonometry-to-Do-Addition.png",
    "project_url": "https://github.com/tinycrops/toroidalNetwork"
  },
  {
    "title": "Large Language Models Share Representations of Latent Grammatical Concepts Across Typologically Diverse Languages",
    "authors": "Jannik Brinkmann, Chris Wendler, Christian Bartelt, Aaron Mueller",
    "venue": "NAACL 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2501.06346",
    "description": "Investigates how large language models (LLMs) represent grammatical concepts across languages, finding that abstract features like number, gender, and tense are encoded in shared multilingual directions. Using sparse autoencoders and causal interventions, the authors show that ablating these features significantly impairs cross-lingual performance, suggesting that LLMs can develop robust, language-agnostic grammatical abstractions.",
    "category": "uses_nnsight",
    "image": "/images/Large-Language-Models-Share-Representations-of-Latent-Grammatical-Concepts-Across-Typologically-Diverse-Languages.png",
    "project_url": "https://github.com/jannik-brinkmann/multilingual-features"
  },
  {
    "title": "LLMs Process Lists With General Filter Heads",
    "authors": "Arnab Sen Sharma, Giordano Rogers, Natalie Shapira, David Bau",
    "venue": "ICLR 2026",
    "year": 2025,
    "url": "https://arxiv.org/abs/2510.26784",
    "description": "Investigates list-processing mechanisms in LLMs, discovering that models learn compact, causal representations of general filtering operations mirroring functional programming's \"filter\" function. Through causal mediation analysis, identifies \"filter heads\"—attention heads encoding filtering predicates in query states at certain tokens. Shows this predicate representation is general and portable, extractable and reapplicable across different collections, formats, languages, and tasks. Also identifies alternative strategy where models eagerly evaluate predicate satisfaction and store results as flags in item representations, revealing human-interpretable implementations of abstract computational operations.",
    "category": "uses_nnsight",
    "image": "/images/LLMs-Process-Lists-With-General-Filter-Heads.png"
  },
  {
    "title": "Localized Cultural Knowledge is Conserved and Controllable in Large Language Models",
    "authors": "Veniamin Veselovsky, Berke Argin, Benedikt Stroebl, Chris Wendler, Robert West, James Evans, Thomas L. Griffiths, Arvind Narayanan",
    "venue": "ArXiv 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2504.10191",
    "description": "Investigates how LLMs represent cultural knowledge across languages, finding that while models often default to English-centric responses, local cultural information persists internally and can be activated. The authors identify an explicit-implicit localization gap and discover a conserved explicit cultural customization vector across non-English languages that enables steering models toward culturally appropriate responses. Steered responses maintain diversity while reducing stereotypes compared to explicit prompting, demonstrating potential for improved cultural customization in multilingual applications.",
    "category": "uses_nnsight",
    "image": "/images/Localized-Cultural-Knowledge-is-Conserved-and-Controllable-in-Large-Language-Models.png",
    "project_url": "https://github.com/vminvsky/localization-gap"
  },
  {
    "title": "Measuring Mechanistic Independence: Can Bias Be Removed Without Erasing Demographics?",
    "authors": "Zhengyang Shan, Aaron Mueller",
    "venue": "EACL 2026",
    "year": 2025,
    "url": "https://arxiv.org/abs/2512.20796",
    "description": "Researchers explore the relationship between demographic bias and general demographic recognition in language models, aiming to understand whether it's possible to reduce bias without compromising the model's ability to detect demographics. By comparing different methods for identifying and mitigating bias, they find that targeted interventions can effectively reduce stereotypes and bias in certain areas, such as profession and education, without degrading overall performance. This work highlights the importance of nuanced, task-specific approaches to debiasing language models.",
    "category": "uses_nnsight",
    "image": "/images/Measuring-Mechanistic-Independence-Can-Bias-Be-Removed-Without-Erasing-Demographics.png"
  },
  {
    "title": "nnterp: A Standardized Interface for Mechanistic Interpretability of Transformers",
    "authors": "Clément Dumas",
    "venue": "NeurIPS 2025 Mech Interp Workshop",
    "year": 2025,
    "url": "https://arxiv.org/abs/2511.14465",
    "description": "Develops nnterp, a lightweight wrapper around NNsight providing unified interfaces for transformer analysis while preserving original HuggingFace implementations. Through automatic module renaming and comprehensive validation testing, enables writing intervention code once to deploy across 50+ model variants spanning 16 architecture families. Includes built-in implementations of common interpretability methods (logit lens, patchscope, activation steering) and packaged validation tests for verifying custom model compatibility, bridging correctness and usability in interpretability tooling.",
    "category": "uses_nnsight",
    "image": "/images/nnterp-A-Standardized-Interface-for-Mechanistic-Interpretability-of-Transformers.png",
    "project_url": "https://github.com/ndif-team/nnterp"
  },
  {
    "title": "Not Just a Piece of Cake: Cross-Lingual Fine-Tuning for Idiom Identification",
    "authors": "Kfir Bar, Kai Golan Hashiloni, Ofri Hefetz, Alon Mannor",
    "venue": "IJCNLP-AACL 2025",
    "year": 2025,
    "url": "https://aclanthology.org/2025.ijcnlp-long.136/",
    "description": "Researchers tackle the challenge of identifying idiomatic expressions in languages with limited annotated data, exploring the effectiveness of cross-lingual fine-tuning as a solution. By adapting models to work across languages, this approach aims to improve the accuracy of idiomatic expression identification. This work has important implications for natural language processing applications in diverse linguistic contexts.",
    "category": "uses_nnsight"
  },
  {
    "title": "Overcoming Sparsity Artifacts in Crosscoders to Interpret Chat-Tuning",
    "authors": "Julian Minder, Clément Dumas, Caden Juang, Bilal Chughtai, Neel Nanda",
    "venue": "NeurIPS 2025 Mech Interp Workshop",
    "year": 2025,
    "url": "https://arxiv.org/abs/2504.02922",
    "description": "Identifies issues in crosscoder-based model diffing that misattribute concepts as chat-specific when they exist in both base and fine-tuned models, stemming from L1 training loss. Develops Latent Scaling to measure latent presence more accurately and trains crosscoders with BatchTopK loss, substantially mitigating these issues. Using improved crosscoders on Gemma 2 2B, successfully identifies genuinely chat-specific, interpretable, and causally effective latents representing concepts like false information and refusal triggers, advancing model diffing best practices.",
    "category": "uses_nnsight",
    "image": "/images/Overcoming-Sparsity-Artifacts-in-Crosscoders-to-Interpret-Chat-Tuning.png"
  },
  {
    "title": "Patch Explorer: Interpreting Diffusion Models through Interaction",
    "authors": "Imke Grabe, Jaden Fiotto-Kaufman, Rohit Gandikota, David Bau",
    "venue": "CVPR MIV Workshop",
    "year": 2025,
    "url": "https://www.semanticscholar.org/paper/c237de459672365809aa8ff59f7fec8954fc0f8b",
    "description": "Introduces Patch Explorer, an interactive interface for visualizing and manipulating cross-attention head behavior in diffusion models. Built on NNsight interventions, the tool enables users to inspect and manipulate individual attention heads across layers and timesteps, revealing that heads independently capture semantic concepts like object parts. Users can edit semantic associations within diffusion models (e.g., adding a unicorn horn to a horse) and understand the role of diffusion timesteps through precise interventions, offering insights into generative processes.",
    "category": "uses_nnsight",
    "image": "/images/Patch-Explorer-Interpreting-Diffusion-Models-through-Interaction.png"
  },
  {
    "title": "Prisma: An Open Source Toolkit for Mechanistic Interpretability in Vision and Video",
    "authors": "Sonia Joseph, Praneet Suresh, Lorenz Hufe, Edward Stevinson, Robert Graham, Yash Vadi, Danilo Bzdok, Sebastian Lapuschkin, Lee Sharkey, Blake Aaron Richards",
    "venue": "ArXiv 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2504.19475",
    "description": "Researchers have made significant strides in understanding how language models work, but similar progress in understanding vision models has been limited by a lack of accessible tools and pre-trained models. To address this gap, a new open-source framework called Prisma has been developed, providing a unified toolkit and suite of pre-trained models to accelerate research in vision mechanistic interpretability. By making it easier to study vision models, Prisma opens up new avenues for understanding how these models work and how they can be improved.",
    "category": "referencing",
    "image": "/images/Prisma-An-Open-Source-Toolkit-for-Mechanistic-Interpretability-in-Vision-and-Video.png"
  },
  {
    "title": "Provable Low-Frequency Bias of In-Context Learning of Representations",
    "authors": "Yongyi Yang, Hidenori Tanaka, Wei Hu",
    "venue": "ArXiv 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2507.13540",
    "description": "Presents the first rigorous explanation of how LLMs internalize data-generating processes during in-context learning through a unified framework of double convergence (over context and across layers). This process leads to an implicit bias toward smooth, low-frequency representations, proved analytically and verified empirically. The theory explains why learned representations exhibit globally structured but locally distorted geometry, why total energy decays without vanishing, and predicts intrinsic robustness to high-frequency noise.",
    "category": "uses_nnsight",
    "image": "/images/Provable-Low-Frequency-Bias-of-In-Context-Learning-of-Representations.png"
  },
  {
    "title": "Punctuation and Predicates in Language Models",
    "authors": "Sonakshi Chauhan, Maheep Chaudhary, Koby Choy, Samuel Nellessen, Nandi Schoots",
    "venue": "EACL 2026",
    "year": 2025,
    "url": "https://arxiv.org/abs/2508.14067",
    "description": "Explores information collection and propagation in LLMs, examining the computational importance of punctuation tokens and how models process different input components (subjects, adjectives, sentences) and reasoning rules. Through intervention experiments on GPT-2, DeepSeek, and Gemma, the authors find model-specific differences in punctuation's necessity and sufficiency. Interchange intervention and layer-swapping reveal that conditional statements and universal quantification are processed very differently, offering insights into punctuation usage and reasoning mechanisms.",
    "category": "uses_nnsight",
    "image": "/images/Punctuation-and-Predicates-in-Language-Models.png"
  },
  {
    "title": "Representation Shattering in Transformers: A Synthetic Study with Knowledge Editing",
    "authors": "Kento Nishi, Rahul Ramesh, Maya Okawa, Mikail Khona, Hidenori Tanaka, Ekdeep Singh Lubana",
    "venue": "ICML 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2410.17194",
    "description": "Investigates why Knowledge Editing (KE) methods in large language models can degrade factual recall and reasoning abilities, proposing that KE inadvertently distorts concept representations beyond the targeted fact—a phenomenon they call representation shattering. Through synthetic tasks and experiments on LLaMA and Mamba models, the authors demonstrate that modifying one fact can disrupt related knowledge structures, explaining the broader performance degradation caused by KE.",
    "category": "uses_ndif",
    "image": "/images/Representation-Shattering-in-Transformers-A-Synthetic-Study-with-Knowledge-Editing.png"
  },
  {
    "title": "Robustly Identifying Concepts Introduced During Chat Fine-Tuning Using Crosscoders",
    "authors": "Julian Minder, Clément Dumas, Caden Juang, Bilal Chughtai, Neel Nanda",
    "venue": "Qeios 2025",
    "year": 2025,
    "url": "https://doi.org/10.32388/r3sz5u",
    "description": "Improves model diffing techniques for understanding how fine-tuning alters language model behavior. The authors identify flaws in the standard crosscoder method that lead to misattributing shared concepts as fine-tuning-specific. They introduce Latent Scaling to better measure concept presence across models and propose a new BatchTopK loss that avoids these issues. Their method uncovers chat-specific, interpretable latents (e.g., latents tied to refusals or misinformation) offering clearer insights into the effects of chat tuning.",
    "category": "uses_nnsight",
    "image": "/images/Robustly-Identifying-Concepts-Introduced-During-Chat-Fine-Tuning-Using-Crosscoders.png"
  },
  {
    "title": "Securing External Deeper-than-black-box GPAI Evaluations",
    "authors": "Alejandro Tlaie, Jimmy Farrell",
    "venue": "ArXiv 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2503.07496",
    "description": "Evaluating the performance and security of general-purpose AI models is crucial, but it poses significant challenges. Researchers are working to identify these challenges and develop effective solutions to ensure the safe and reliable assessment of these powerful AI systems. By addressing these issues, the field can move closer to developing trustworthy and transparent AI models.",
    "category": "referencing",
    "image": "/images/Securing-External-Deeper-than-black-box-GPAI-Evaluations.png"
  },
  {
    "title": "Separating Tongue from Thought: Activation Patching Reveals Language-Agnostic Concept Representations in Transformers",
    "authors": "Clément Dumas, Chris Wendler, Veniamin Veselovsky, Giovanni Monea, Robert West",
    "venue": "ACL",
    "year": 2025,
    "url": "https://arxiv.org/abs/2411.08745",
    "description": "Investigates whether large language models (LLMs) develop language-agnostic concept representations by analyzing their latent activations during translation tasks. The authors find that language information emerges earlier than conceptual meaning in the model's layers and show, through activation patching, that concepts and languages can be independently manipulated, providing evidence for universal concept representations.",
    "category": "uses_nnsight",
    "image": "/images/Separating-Tongue-from-Thought-Activation-Patching-Reveals-Language-Agnostic-Concept-Representations-in-Transformers.png",
    "project_url": "https://github.com/Butanium/llm-lang-agnostic"
  },
  {
    "title": "Signatures of human-like processing in Transformer forward passes",
    "authors": "Jennifer Hu, Michael A. Lepori, Michael Franke",
    "venue": "ArXiv 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2504.14107",
    "description": "Explores whether Transformer models' internal processing dynamics predict human cognitive processing beyond output-level predictions. Across five studies spanning domains and modalities, the authors find that layer-time dynamics in pre-trained Transformers consistently predict signatures of human processing (such as reading times or difficulty judgments) above and beyond the model's output probability distribution, suggesting that Transformers and humans may be facilitated or impeded by similar stimulus properties through general-purpose learning objectives.",
    "category": "uses_nnsight",
    "image": "/images/Signatures-of-human-like-processing-in-Transformer-forward-passes.png"
  },
  {
    "title": "Sparse Autoencoders for Sequential Recommendation Models: Interpretation and Flexible Control",
    "authors": "Anton Klenitskiy, Konstantin Polev, Daria Denisova, Alexey Vasilev, Dmitry Simakov, Gleb Gusev",
    "venue": "ArXiv 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2507.12202",
    "description": "Applies sparse autoencoders (SAEs) to transformer-based sequential recommendation models, showing that learned directions are more interpretable and monosemantic than original hidden dimensions. Demonstrates that SAE features can effectively control model behavior, enabling straightforward methods for end-users to adjust recommendations to different contexts and scenarios. The approach successfully extracts interpretable features from recommender systems while maintaining training efficiency at 1% of traditional SAE costs.",
    "category": "uses_nnsight",
    "image": "/images/Sparse-Autoencoders-for-Sequential-Recommendation-Models-Interpretation-and-Flexible-Control.png"
  },
  {
    "title": "Steering Fine-Tuning Generalization with Targeted Concept Ablation",
    "authors": "Helena Casademunt, Caden Juang, Samuel Marks, Senthooran Rajamanoharan, Neel Nanda",
    "venue": "ICLR 2025 Workshop on Building Trust in Language Models and Applications",
    "year": 2025,
    "url": "https://openreview.net/forum?id=2HyKWpAB4i",
    "description": "Introduces a method for steering fine-tuned models toward intended generalizations by identifying and ablating sparse autoencoder latents linked to undesired concepts. This helps disambiguate between multiple training-consistent but behaviorally distinct solutions, such as aligned vs. deceptive models. The approach outperforms baselines on two tasks, eliminating spurious gender correlations and guiding attention in double multiple choice, demonstrating its potential for safer model deployment.",
    "category": "uses_nnsight"
  },
  {
    "title": "Steering Large Language Models for Machine Translation Personalization",
    "authors": "Daniel Scalena, Gabriele Sarti, Arianna Bisazza, Elisabetta Fersini, Malvina Nissim",
    "venue": "EACL 2026",
    "year": 2025,
    "url": "https://arxiv.org/abs/2505.16612",
    "description": "Explores strategies for personalizing LLM-generated translations in low-resource literary translation settings. The authors compare prompting strategies and inference-time interventions for steering translations toward personalized styles, proposing a contrastive framework using sparse autoencoder-extracted latent concepts. Results show steering achieves strong personalization while preserving translation quality, with analysis revealing that steering and multi-shot prompting impact similar model layers, suggesting comparable underlying mechanisms.",
    "category": "uses_nnsight",
    "image": "/images/Steering-Large-Language-Models-for-Machine-Translation-Personalization.png",
    "project_url": "https://github.com/DanielSc4/steering-for-personalization"
  },
  {
    "title": "Superposition as Lossy Compression: Measure with Sparse Autoencoders and Connect to Adversarial Vulnerability",
    "authors": "Leonard Bereska, Zoe Tzifa-Kratira, Reza Samavi, Efstratios Gavves",
    "venue": "ArXiv 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2512.13568",
    "description": "Researchers have discovered that neural networks' impressive performance can be attributed to their ability to encode multiple features in a shared space, allowing them to efficiently represent complex information. This approach, known as superposition, enables the networks to capture subtle patterns and relationships that might be lost in more traditional representations. By understanding how neural networks use superposition, we can gain insights into the underlying mechanisms that drive their success.",
    "category": "uses_nnsight",
    "image": "/images/Superposition-as-Lossy-Compression-Measure-with-Sparse-Autoencoders-and-Connect-to-Adversarial-Vulnerability.png"
  },
  {
    "title": "TDHook: A Lightweight Framework for Interpretability",
    "authors": "Yoann Poupart",
    "venue": "ArXiv 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2509.25475",
    "description": "Deep learning models are becoming increasingly complex, making it difficult for existing interpretability tools to keep up. To address this challenge, researchers have developed TDHook, a flexible and lightweight framework that can handle complex models with multiple inputs and outputs, making it suitable for a wide range of applications, from computer vision to natural language processing and reinforcement learning. By providing a more accessible and adaptable tool for interpreting these models, TDHook aims to bridge the gap between different interpretability methods and make it easier to understand how complex models work.",
    "category": "referencing",
    "image": "/images/TDHook-A-Lightweight-Framework-for-Interpretability.png"
  },
  {
    "title": "The Curious Case of Factual (Mis)Alignment between LLMs' Short- and Long-Form Answers",
    "authors": "Saad Obaid ul Islam, Anne Lauscher, Goran Glavaš",
    "venue": "ArXiv 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2510.11218",
    "description": "Introduces SLAQ (Short-Long Form Alignment for Factual Question Answering), revealing systematic misalignment in how 16 LLMs answer identical factual questions when asked in isolation versus integrated into complex queries. Across 600 queries, finds position-dependent accuracy loss and momentum effects where consecutive answers create self-reinforcing patterns. Mechanistic analysis shows aligned facts activate overlapping model internals, with mechanistic similarity metrics predicting short-long answer alignment with up to 78% accuracy, challenging assumptions that strong performance on simple factual queries ensures reliability in complex knowledge-seeking tasks.",
    "category": "uses_nnsight",
    "image": "/images/The-Curious-Case-of-Factual-Mis-Alignment-between-LLMs-Short-and-Long-Form-Answers.png"
  },
  {
    "title": "The Dual-Route Model of Induction",
    "authors": "Sheridan Feucht, Eric Todd, Byron Wallace, David Bau",
    "venue": "COLM 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2504.03022",
    "description": "Identifies concept-level induction heads—attention heads that copy entire lexical units rather than individual tokens. These heads specialize in semantic tasks like translation, while token-level induction heads handle exact copying. The authors show that these two mechanisms operate independently, and ablating token heads leads models to paraphrase instead of copying. They argue concept-level heads may play a broader role in in-context learning.",
    "category": "uses_nnsight",
    "image": "/images/The-Dual-Route-Model-of-Induction.png"
  },
  {
    "title": "The Geometry of Refusal in Large Language Models: Concept Cones and Representational Independence",
    "authors": "Tom Wollschläger, Jannes Elstner, Simon Geisler, Vincent Cohen-Addad, Stephan Günnemann, Johannes Gasteiger",
    "venue": "ICML 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2502.17420",
    "description": "Introduces a gradient-based method to identify and analyze internal mechanisms behind refusal in large language models. Challenging prior claims of a single refusal direction, the authors find multiple mechanistically independent directions and multi-dimensional concept structures that govern refusal. They introduce the concept of representational independence to capture both linear and nonlinear intervention effects, revealing that LLM safety behavior relies on more complex internal structures than previously thought.",
    "category": "uses_nnsight",
    "image": "/images/The-Geometry-of-Refusal-in-Large-Language-Models-Concept-Cones-and-Representational-Independence.png",
    "project_url": "https://github.com/yus100/truth_cones"
  },
  {
    "title": "Towards AI Safety via Interpretability and Oversight",
    "authors": "Subhash Kantamneni",
    "venue": "Dissertation",
    "year": 2025,
    "url": "https://dspace.mit.edu/handle/1721.1/162723",
    "description": "A dissertation advancing AI safety through three main contributions: (1) reverse-engineering addition in LLMs to discover helical number representations and the \"Clock\" algorithm for arithmetic, (2) rigorously evaluating sparse autoencoders under challenging conditions (data scarcity, noise, distribution shift), finding they don't consistently improve probing performance, and (3) introducing a quantitative framework for evaluating scalable oversight, analyzing four oversight games and deriving conditions for optimal Nested Scalable Oversight structures.",
    "category": "uses_nnsight"
  },
  {
    "title": "Understanding How CodeLLMs (Mis)Predict Types with Activation Steering",
    "authors": "Francesca Lucchetti, Arjun Guha",
    "venue": "BlackBoxNLP 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2404.01903",
    "description": "Examines type prediction in code LLMs, constructing adversarial dataset where models initially predict correctly but fail after semantically irrelevant edits. Despite apparent shallow understanding, provides evidence that LLMs learn robust type prediction mechanisms that fail to activate in adversarial scenarios. Using activation steering to manipulate internal activations, restores accurate predictions on adversarial inputs, showing steering activates shared type prediction mechanism across Python and TypeScript, more effectively than in-context examples, demonstrating generalizable code semantic representations transferring across languages.",
    "category": "uses_ndif",
    "image": "/images/Understanding-How-CodeLLMs-MisPredict-Types-with-Activation-Steering.png",
    "project_url": "https://github.com/nuprl/codetrace"
  },
  {
    "title": "When and How Does CLIP Enable Domain and Compositional Generalization?",
    "authors": "Elias Kempf, Simon Schrodi, Max Argus, Thomas Brox",
    "venue": "ICML 2025",
    "year": 2025,
    "url": "https://arxiv.org/abs/2502.09507",
    "description": "Systematically investigates CLIP's generalization abilities by training models on controlled distributions with varying domain diversity and class exposure. Findings show domain diversity is essential for both domain and compositional generalization, though compositional generalization can be surprisingly weak when training contains suboptimal subsets of the test domain. Through data-centric and mechanistic analyses, the authors find successful generalization requires learning sufficiently shared representations in intermediate layers and circuits.",
    "category": "uses_nnsight",
    "image": "/images/When-and-How-Does-CLIP-Enable-Domain-and-Compositional-Generalization.png",
    "project_url": "https://github.com/lmb-freiburg/understanding-clip-ood"
  },
  {
    "title": "A generative benchmark creation framework for detecting common data table versions",
    "authors": "Daniel C. Fox, Aamod Khatiwada, Roee Shraga",
    "venue": "CIKM 2024",
    "year": 2024,
    "url": "https://dl.acm.org/doi/abs/10.1145/3627673.3679157",
    "description": "Introduces a novel framework using large language models (LLMs) to generate benchmarks for data versioning, addressing the lack of standardized evaluation methods in the field. The authors release VerLLM-v1, a benchmark with detailed documentation, version lineage, and complex transformations, facilitating better development and evaluation of data versioning techniques.",
    "category": "uses_nnsight",
    "image": "/images/A-generative-benchmark-creation-framework-for-detecting-common-data-table-versions.png",
    "project_url": "https://github.com/danielcfox/genbenchver"
  },
  {
    "title": "A Primer on the Inner Workings of Transformer-based Language Models",
    "authors": "Javier Ferrando, Gabriele Sarti, Arianna Bisazza, Marta R. Costa-jussà",
    "venue": "ArXiv 2024",
    "year": 2024,
    "url": "https://arxiv.org/abs/2405.00208",
    "description": "A comprehensive primer offers a technical introduction to interpreting the inner workings of advanced language models, specifically those based on the Transformer architecture. By synthesizing years of research, this work provides a unified understanding of the internal mechanisms that drive these models, shedding light on their underlying connections and relationships. This overview helps to contextualize current approaches and inform future research directions in the field.",
    "category": "referencing",
    "image": "/images/A-Primer-on-the-Inner-Workings-of-Transformer-based-Language-Models.png"
  },
  {
    "title": "Benchmarking Mental State Representations in Language Models",
    "authors": "Matteo Bortoletto, Constantin Ruhdorfer, Lei Shi, Andreas Bulling",
    "venue": "ICML 2024 Workshop on Mechanistic Interpretability",
    "year": 2024,
    "url": "https://openreview.net/forum?id=yEwEVoH9Be",
    "description": "Conducts a benchmark study on the internal representation of mental states in language models, analyzing different model sizes, fine-tuning strategies, and prompt designs. Finds that the quality of belief representations improves with model size and fine-tuning but is sensitive to prompt variations. Extends previous activation editing experiments, showing that reasoning performance can be improved by steering model activations without training probes. First to investigate the impact of prompt variations on probing performance in Theory of Mind tasks.",
    "category": "uses_nnsight"
  },
  {
    "title": "Black-Box Access is Insufficient for Rigorous AI Audits",
    "authors": "Stephen Casper, Carson Ezell, Charlotte Siegmann, Noam Kolt, Taylor Lynn Curtis, Ben Bucknall, Andreas Haupt, Kevin Wei, Jérémy Scheurer, Marius Hobbhahn, Lee Sharkey, Satyapriya Krishna, Marvin Von Hagen, Silas Alberti, Alan Chan, Qinyi Sun, Michael Gerovitch, David Bau, Max Tegmark, David Krueger, Dylan Hadfield-Menell",
    "venue": "FAccT 2024",
    "year": 2024,
    "url": "https://doi.org/10.1145/3630106.3659037",
    "description": "Auditing AI systems is crucial for ensuring their reliability and accountability, but the effectiveness of these audits depends on the level of access granted to auditors. By examining the limitations of traditional black-box audits, researchers highlight the benefits of more comprehensive approaches, such as white-box and outside-the-box audits, which allow for more thorough evaluations and targeted scrutiny. Greater transparency and access can lead to more robust audits and better understanding of AI systems.",
    "category": "referencing",
    "image": "/images/Black-Box-Access-is-Insufficient-for-Rigorous-AI-Audits.png"
  },
  {
    "title": "Comgra: A Tool for Analyzing and Debugging Neural Networks",
    "authors": "Florian Dietz, Sophie Fellenz, Dietrich Klakow, Marius Kloft",
    "venue": "ArXiv 2024",
    "year": 2024,
    "url": "https://arxiv.org/abs/2407.21656",
    "description": "Researchers have developed a powerful tool to help demystify the inner workings of neural networks. Comgra, an open-source Python library, provides a user-friendly interface to visualize and analyze the internal behavior of these complex models, enabling faster debugging, more informed architecture design, and deeper understanding of how they make decisions. By making it easier to inspect and interpret neural networks, comgra has the potential to accelerate progress in a range of applications.",
    "category": "referencing",
    "image": "/images/Comgra-A-Tool-for-Analyzing-and-Debugging-Neural-Networks.png"
  },
  {
    "title": "Competition dynamics shape algorithmic phases of in-context learning",
    "authors": "Core Francisco Park, Ekdeep Singh Lubana, Itamar Pres, Hidenori Tanaka",
    "venue": "ICLR 2025",
    "year": 2024,
    "url": "https://arxiv.org/abs/2412.01003",
    "description": "Proposes synthetic sequence modeling task (simulating finite mixture of Markov chains) that reproduces well-known ICL results, offering unified setting for studying the concept. Decomposes model behavior into four algorithms combining fuzzy retrieval vs. inference with unigram vs. bigram statistics. These algorithms engage in competition dynamics, with experimental conditions (context size, training amount) dictating which algorithm dominates, revealing transient nature of ICL and suggesting it's best understood as mixture of algorithms rather than monolithic capability.",
    "category": "uses_ndif",
    "image": "/images/Competition-dynamics-shape-algorithmic-phases-of-in-context-learning.png"
  },
  {
    "title": "Developmentally plausible multimodal language models are highly modular",
    "authors": "Alina Klerings, Christian Bartelt, Aaron Mueller",
    "venue": "CoNLL 2024 BabyLM Challenge",
    "year": 2024,
    "url": "https://aclanthology.org/2024.conll-babylm.10/",
    "description": "Researchers explore the concept of emergent modularity in large language models, where specialized components develop to tackle specific tasks, and consider the potential implications for understanding how these models process and generate language. By examining these modules, scientists can gain insight into the inner workings of language models and how they adapt to different tasks. This knowledge can help improve the design and performance of future language models.",
    "category": "uses_nnsight"
  },
  {
    "title": "Evaluating Open-Source Sparse Autoencoders on Disentangling Factual Knowledge in GPT-2 Small",
    "authors": "Maheep Chaudhary, Atticus Geiger",
    "venue": "ArXiv 2024",
    "year": 2024,
    "url": "https://arxiv.org/abs/2409.04478",
    "description": "Evaluates the utility of high-dimensional sparse autoencoders (SAEs) for causal analysis in mechanistic interpretability, using the RAVEL benchmark on GPT-2 small. Compares four SAEs to neurons as a baseline and linear features learned via distributed alignment search (DAS) as a skyline. Findings indicate that SAEs struggle to match the neuron baseline and fall significantly short of the DAS skyline in distinguishing between knowledge of a city's country and continent.",
    "category": "uses_nnsight",
    "image": "/images/Evaluating-Open-Source-Sparse-Autoencoders-on-Disentangling-Factual-Knowledge-in-GPT-2-Small.png",
    "project_url": "https://github.com/MaheepChaudhary/SAE-Ravel"
  },
  {
    "title": "Evidence of Learned Look-Ahead in a Chess-Playing Neural Network",
    "authors": "Erik Jenner, Shreyas Kapur, Vasil Georgiev, Cameron Allen, Scott Emmons, Stuart Russell",
    "venue": "NeurIPS",
    "year": 2024,
    "url": "https://arxiv.org/abs/2406.00877",
    "description": "Presents evidence of learned look-ahead in the policy network of Leela Chess Zero, showing that it internally represents future optimal moves, which are critical in certain board states. Demonstrates through activations, attention heads, and a probing model that neural networks can predict optimal moves ahead, providing a basis for understanding learned algorithmic capabilities in neural networks.",
    "category": "uses_nnsight",
    "image": "/images/Evidence-of-Learned-Look-Ahead-in-a-Chess-Playing-Neural-Network.png",
    "project_url": "https://leela-interp.github.io/"
  },
  {
    "title": "Hidden pieces: An analysis of linear probes for gpt representation edits",
    "authors": "Austin L Davis, Gita Sukthankar",
    "venue": "ICMLA",
    "year": 2024,
    "url": "https://ieeexplore.ieee.org/abstract/document/10903266/",
    "description": "Explores using probing classifiers to modify the internal hidden state of a chess-playing transformer, revealing that the model encodes an editable representation of the game state. By training linear classifiers, the authors demonstrate that they can reliably delete pieces from the board, showing the model's internal understanding of game dynamics.",
    "category": "uses_nnsight",
    "image": "/images/Hidden-pieces-An-analysis-of-linear-probes-for-gpt-representation-edits.png"
  },
  {
    "title": "How do llamas process multilingual text? a latent exploration through activation patching",
    "authors": "Clément Dumas, Veniamin Veselovsky, Giovanni Monea, Robert West, Chris Wendler",
    "venue": "ICML 2024 Workshop on Mechanistic Interpretability",
    "year": 2024,
    "url": "https://openreview.net/forum?id=0ku2hIm4BS",
    "description": "Analyzes Llama-2's forward pass during word translation tasks to explore whether it develops language-agnostic concept representations. Shows that language encoding occurs earlier than concept encoding and that activation patching can independently alter either the concept or the language. Demonstrates that averaging latents across languages does not hinder translation performance, providing evidence for universal concept representation in multilingual models.",
    "category": "uses_nnsight",
    "image": "/images/How-do-llamas-process-multilingual-text-a-latent-exploration-through-activation-patching.png"
  },
  {
    "title": "ICLR: In-Context Learning of Representations",
    "authors": "Core Francisco Park, Andrew Lee, Ekdeep Singh Lubana, Yongyi Yang, Maya Okawa, Kento Nishi, Martin Wattenberg, Hidenori Tanaka",
    "venue": "ICLR",
    "year": 2024,
    "url": "https://arxiv.org/abs/2501.00070",
    "description": "Explores whether large language models can reorganize internal concept representations based on in-context examples that conflict with pretrained semantics. Using a synthetic graph-tracing task, the authors show that sufficient context can trigger a reorganization of representations to match the graph's structure, though strong semantic priors can resist this shift. They interpret the behavior through the lens of energy minimization and argue that context length is a key factor in enabling flexible representation formation.",
    "category": "uses_ndif",
    "image": "/images/ICLR-In-Context-Learning-of-Representations.png"
  },
  {
    "title": "InterPLM: Discovering Interpretable Features in Protein Language Models via Sparse Autoencoders",
    "authors": "Elana Simon, James Zou",
    "venue": "Nature Methods 2024",
    "year": 2024,
    "url": "https://arxiv.org/abs/2412.12101",
    "description": "This paper introduces a method for interpreting protein language models (PLMs) by using sparse autoencoders (SAEs) to extract human-interpretable features from model embeddings. Applied to ESM-2, the approach reveals thousands of latent features per layer that align with known biological concepts like binding sites and structural motifs, far exceeding the interpretability of individual neurons. The authors also identify coherent, novel features that extend beyond current biological annotations and propose a language model-based pipeline to help interpret them. These features can aid in filling database gaps and steering protein design. The study presents InterPLM, a platform for exploring these representations, along with open-source tools for further analysis.",
    "category": "uses_nnsight",
    "image": "/images/InterPLM-Discovering-Interpretable-Features-in-Protein-Language-Models-via-Sparse-Autoencoders.png",
    "project_url": "https://github.com/ElanaPearl/interPLM"
  },
  {
    "title": "Language Models Represent Beliefs of Self and Others",
    "authors": "Wentao Zhu, Zhining Zhang, Yizhou Wang",
    "venue": "ICML",
    "year": 2024,
    "url": "https://arxiv.org/abs/2402.18496",
    "description": "Investigates the presence of Theory of Mind (ToM) abilities in large language models, identifying internal representations of self and others' beliefs through neural activations. Shows that manipulating these representations significantly alters ToM performance, highlighting their importance in social reasoning. Extends findings to various social reasoning tasks involving causal inference.",
    "category": "uses_nnsight",
    "image": "/images/Language-Models-Represent-Beliefs-of-Self-and-Others.png",
    "project_url": "https://walter0807.github.io/RepBelief/"
  },
  {
    "title": "Locating and Editing Factual Associations in Mamba",
    "authors": "Arnab Sen Sharma, David Atkinson, David Bau",
    "venue": "COLM",
    "year": 2024,
    "url": "https://arxiv.org/abs/2404.03646",
    "description": "Investigates factual recall mechanisms in the Mamba state space model, comparing it to autoregressive transformer models. Finds that key components responsible for factual recall are localized in middle layers and at specific token positions, mirroring patterns seen in transformers. Demonstrates that rank-one model editing can insert facts at particular locations and adapts attention-knockout techniques to analyze information flow. Despite architectural differences, the study concludes that Mamba and transformer models share significant similarities in factual recall processes.",
    "category": "uses_nnsight",
    "image": "/images/Locating-and-Editing-Factual-Associations-in-Mamba.png",
    "project_url": "https://romba.baulab.info/"
  },
  {
    "title": "Multi-property Steering of Large Language Models with Dynamic Activation Composition",
    "authors": "Daniel Scalena, Gabriele Sarti, Malvina Nissim",
    "venue": "BlackboxNLP 2024",
    "year": 2024,
    "url": "https://arxiv.org/abs/2406.17563",
    "description": "Researchers have developed a new approach to steering language models, called Dynamic Activation Composition, which allows for more nuanced control over the model's output. This method addresses a key limitation of existing techniques, which can struggle to balance multiple competing goals, such as ensuring the generated text meets certain criteria while still sounding natural. By adapting to the specific needs of each generation task, this approach enables more effective and flexible language model conditioning.",
    "category": "uses_nnsight",
    "image": "/images/Multi-property-Steering-of-Large-Language-Models-with-Dynamic-Activation-Composition.png",
    "project_url": "https://github.com/DanielSc4/Dynamic-Activation-Composition"
  },
  {
    "title": "Penzai + Treescope: A Toolkit for Interpreting, Visualizing, and Editing Models As Data",
    "authors": "Daniel D. Johnson",
    "venue": "ICML 2024",
    "year": 2024,
    "url": "https://arxiv.org/abs/2408.00211",
    "description": "Researchers have developed a new library, Penzai, and visualization tool, Treescope, to make it easier to understand and modify complex machine learning models. By representing models as simple data structures, Penzai enables users to easily edit and intervene on model components, while Treescope provides immediate visual feedback. This approach streamlines the process of analyzing and refining machine learning models, allowing researchers to focus on improving model performance and interpretability.",
    "category": "referencing",
    "image": "/images/Penzai-Treescope-A-Toolkit-for-Interpreting-Visualizing-and-Editing-Models-As-Data.png"
  },
  {
    "title": "pyvene: A Library for Understanding and Improving PyTorch Models via Interventions",
    "authors": "Zhengxuan Wu, Atticus Geiger, Aryaman Arora, Jing Huang, Zheng Wang, Noah D. Goodman, Christopher D. Manning, Christopher Potts",
    "venue": "NAACL 2024",
    "year": 2024,
    "url": "https://arxiv.org/abs/2403.07809",
    "description": "Researchers have developed an open-source library called pyvene to simplify and standardize interventions on neural models, a crucial operation in various AI applications. By providing a unified framework for customizing and sharing interventions, pyvene aims to accelerate progress in areas like model editing, robustness, and interpretability. This library enables users to easily design and implement complex interventions, making it a valuable tool for the AI research community.",
    "category": "referencing",
    "image": "/images/pyvene-A-Library-for-Understanding-and-Improving-PyTorch-Models-via-Interventions.png"
  },
  {
    "title": "Sparse Autoencoders Reveal Temporal Difference Learning in Large Language Models",
    "authors": "Can Demircan, Tankred Saanum, Akshay K. Jagadish, Marcel Binz, Eric Schulz",
    "venue": "ICLR 2025",
    "year": 2024,
    "url": "https://arxiv.org/abs/2410.01280",
    "description": "Researchers are working to understand how large language models can learn from a few examples and adapt to solve specific problems, such as those involving reinforcement learning. By analyzing the internal workings of a model, they've discovered how it represents key concepts, like errors and values, despite not being explicitly trained for them. This study develops a new approach to studying and manipulating this type of learning, which could lead to a deeper understanding of how these models work.",
    "category": "referencing",
    "image": "/images/Sparse-Autoencoders-Reveal-Temporal-Difference-Learning-in-Large-Language-Models.png"
  },
  {
    "title": "Structured In-Context Task Representations",
    "authors": "Core Francisco Park, Andrew Lee, Ekdeep Singh Lubana, Kento Nishi, Maya Okawa, Hidenori Tanaka",
    "venue": "NeurIPS 2024 Workshop on Symmetry and Geometry in Neural Representations",
    "year": 2024,
    "url": "https://openreview.net/forum?id=hlOu6w1a8T",
    "description": "Investigates whether language models develop interpretable internal representations during in-context learning. Using synthetic data based on geometric structures like grids and rings, the authors show that models do form internal representations reflecting these structures. They also find that in-context examples can override existing semantic priors by shaping representations in new dimensions. The study concludes that language models can build meaningful internal representations from in-context examples alone.",
    "category": "uses_ndif",
    "image": "/images/Structured-In-Context-Task-Representations.png"
  },
  {
    "title": "The Quest for the Right Mediator: Surveying Mechanistic Interpretability for NLP Through the Lens of Causal Mediation Analysis",
    "authors": "Aaron Mueller, Jannik Brinkmann, Millicent Li, Samuel Marks, Koyena Pal, Nikhil Prakash, Can Rager, Aruna Sankaranarayanan, Arnab Sen Sharma, Jiuding Sun, Eric Todd, David Bau, Yonatan Belinkov",
    "venue": "Computational Linguistics 2024",
    "year": 2024,
    "url": "https://arxiv.org/abs/2408.01416",
    "description": "Researchers are working to better understand how language models make decisions, but the field of interpretability lacks a unified approach, making it hard to compare and build on existing techniques. To address this, a new framework is proposed, based on causal mediation analysis, which helps to categorize and evaluate different methods for understanding language model behavior. By providing a clearer structure for the field, this approach aims to guide researchers in choosing the most effective methods for their goals and inform future work in developing more transparent and accountable language models.",
    "category": "referencing",
    "image": "/images/The-Quest-for-the-Right-Mediator-Surveying-Mechanistic-Interpretability-for-NLP-Through-the-Lens-of-Causal-Mediation-Analysis.png"
  },
  {
    "title": "Token Erasure as a Footprint of Implicit Vocabulary Items in LLMs",
    "authors": "Sheridan Feucht, David Atkinson, Byron Wallace, David Bau",
    "venue": "EMNLP 2024",
    "year": 2024,
    "url": "https://arxiv.org/abs/2406.20086",
    "description": "Investigates how LLMs transform arbitrary groups of tokens into higher-level representations, focusing on multi-token words and named entities. Identifies a pronounced \"erasure\" effect where information about previous tokens is quickly forgotten in early layers. Proposes a method to probe the implicit vocabulary of LLMs by analyzing token representation changes across layers, providing results for Llama-2-7b and Llama-3-8B. This study represents the first effort to explore the implicit vocabulary of LLMs.",
    "category": "uses_nnsight",
    "image": "/images/Token-Erasure-as-a-Footprint-of-Implicit-Vocabulary-Items-in-LLMs.png",
    "project_url": "https://footprints.baulab.info/"
  },
  {
    "title": "What needs to go right for an induction head? A mechanistic study of in-context learning circuits and their formation",
    "authors": "Aaditya K. Singh, Ted Moskovitz, Felix Hill, Stephanie C. Y. Chan, Andrew M. Saxe",
    "venue": "ArXiv 2024",
    "year": 2024,
    "url": "https://arxiv.org/abs/2404.07129",
    "description": "Researchers have been studying a key component of transformer models called the induction head, which plays a crucial role in enabling these models to learn from context. To better understand how induction heads emerge and interact, scientists developed a new framework for analyzing and manipulating the model's internal workings, revealing the diverse and interconnected nature of these components. By uncovering the underlying mechanisms that drive the formation of induction heads, this research sheds light on the complex processes that enable transformer models to learn and adapt.",
    "category": "referencing",
    "image": "/images/What-needs-to-go-right-for-an-induction-head-A-mechanistic-study-of-in-context-learning-circuits-and-their-formation.png"
  },
  {
    "title": "CircuitTuning: Improving Math Reasoning in LLMs via Targeted Sub-Network Updates",
    "authors": "N Prakash, D Ren, D Moritz, Y Assogba",
    "venue": "ArXiv",
    "year": 0,
    "url": "https://openreview.net/forum?id=UVJeljeleU",
    "description": "Researchers have made progress in understanding how large language models (LLMs) work internally, identifying specialized subnetworks or \"circuits\" that handle particular tasks. By studying these circuits, scientists can gain insight into the complex decision-making processes of LLMs, ultimately shedding light on their strengths and limitations. This knowledge can inform the development of more efficient and effective language models.",
    "category": "uses_nnsight"
  },
  {
    "title": "Enhancing Logical Consistency in Language Models through Neuro-Symbolic Feedback and Structured Reasoning",
    "authors": "H Fartale, A Kattamuri, R Raja, A Vats, I Prasad, et al.",
    "venue": "ArXiv",
    "year": 0,
    "url": "https://openreview.net/forum?id=X2qOD7uQxl",
    "description": "Researchers are working to address a key limitation of Large Language Models (LLMs): their inability to excel at tasks that demand explicit reasoning and understanding. By tackling this challenge, they aim to create more robust and reliable language models that can better handle complex tasks. This effort has significant implications for the development of more sophisticated language processing systems.",
    "category": "uses_nnsight"
  },
  {
    "title": "Linking forward-pass dynamics in Transformers and real-time processing in humans",
    "authors": "J Hu, MA Lepori, M Franke",
    "venue": "ArXiv",
    "year": 0,
    "url": "https://openreview.net/forum?id=qErnjDVfdN",
    "description": "Researchers are leveraging modern AI models to gain insights into human cognition, exploring whether these models can accurately predict human-derived measures and shed light on the underlying mechanisms of human thought and behavior. By using AI in this way, scientists can develop more nuanced theories of human cognition and better understand the complex processes that drive human perception, decision-making, and action. This approach has the potential to revolutionize the field of cognitive science.",
    "category": "uses_nnsight"
  }
]