Menu

Citing NDIF

If you use NNsight or NDIF resources in your research, please cite the following:

Citation

Jaden Fiotto-Kaufman, Alexander R. Loftus, Eric Todd, Jannik Brinkmann, Caden Juang, Koyena Pal, Can Rager, Aaron Mueller, Samuel Marks, Arnab Sen Sharma, Francesca Lucchetti, Michael Ripa, Adam Belfki, Nikhil Prakash, Sumeet Multani, Carla Brodley, Arjun Guha, Jonathan Bell, Byron Wallace, and David Bau. "NNsight and NDIF: Democratizing Access to Foundation Model Internals," 2024. arXiv preprint arXiv:2407.14561. Available at https://arxiv.org/abs/2407.14561.

BibTeX

@article{fiotto2024nnsight,
  title         = {{NNsight} and {NDIF}: Democratizing Access to Foundation Model Internals},
  author        = {Fiotto-Kaufman, Jaden and Loftus, Alexander R and Todd, Eric and Brinkmann, Jannik and Juang, Caden and Pal, Koyena and Rager, Can and Mueller, Aaron and Marks, Samuel and Sharma, Arnab Sen and Lucchetti, Francesca and Ripa, Michael and Belfki, Adam and Prakash, Nikhil and Multani, Sumeet and Brodley, Carla and Guha, Arjun and Bell, Jonathan and Wallace, Byron and Bau, David},
  journal       = {arXiv preprint arXiv:2407.14561},
  year          = {2024},
  eprint        = {2407.14561},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/2407.14561},
}

In addition, when you publish work using NNsight or NDIF resources, we'd love you to email us directly at info@ndif.us to tell us about your work. This helps us track our impact and supports our continued efforts to provide open-source resources for reproducible and transparent research on large-scale AI systems.

Research Using NDIF

Evaluating Open-Source Sparse Autoencoders on Disentangling Factual Knowledge in GPT-2 Small

  @misc{chaudhary2024evaluatingopensourcesparseautoencoders,
    title         = {Evaluating Open-Source Sparse Autoencoders on Disentangling Factual Knowledge in {GPT-2} Small},
    author        = {Chaudhary, Maheep and Geiger, Atticus},
    year          = {2024},
    eprint        = {2409.04478},
    archivePrefix = {arXiv},
    primaryClass  = {cs.LG},
    url           = {https://arxiv.org/abs/2409.04478},
  }

Locating and Editing Factual Associations in Mamba

  @misc{sharma2024locatingeditingfactualassociations,
    title         = {Locating and Editing Factual Associations in {Mamba}},
    author        = {Sharma, Arnab Sen and Atkinson, David and Bau, David},
    year          = {2024},
    eprint        = {2404.03646},
    archivePrefix = {arXiv},
    primaryClass  = {cs.CL},
    url           = {https://arxiv.org/abs/2404.03646},
  }

Benchmarking Mental State Representations in Language Models

  @misc{bortoletto2024benchmarkingmentalstaterepresentations,
    author        = {Bortoletto, Matteo and Ruhdorfer, Constantin and Shi, Lei and Bulling, Andreas},
    title         = {Benchmarking Mental State Representations in Language Models},
    year          = {2024},
    eprint        = {2406.17513},
    archivePrefix = {arXiv},
    primaryClass  = {cs.CL},
    url           = {https://arxiv.org/abs/2406.17513},
  }

Token Erasure as a Footprint of Implicit Vocabulary Items in LLMs

  @misc{feucht2024tokenerasurefootprintimplicit,
    title         = {Token Erasure as a Footprint of Implicit Vocabulary Items in {LLMs}},
    author        = {Feucht, Sheridan and Atkinson, David and Wallace, Byron and Bau, David},
    year          = {2024},
    eprint        = {2406.20086},
    archivePrefix = {arXiv},
    primaryClass  = {cs.CL},
    url           = {https://arxiv.org/abs/2406.20086},
  }

How do Llamas process multilingual text? A latent exploration through activation patching

  @inproceedings{dumas2024how,
    title     = {How do {Llamas} process multilingual text? A latent exploration through activation patching},
    author    = {Dumas, Clément and Veselovsky, Veniamin and Monea, Giovanni and West, Robert and Wendler, Chris},
    booktitle = {{ICML} 2024 Workshop on Mechanistic Interpretability},
    year      = {2024},
    url       = {https://openreview.net/forum?id=0ku2hIm4BS},
  }

Evidence of Learned Look-Ahead in a Chess-Playing Neural Network

  @misc{jenner2024evidencelearnedlookaheadchessplaying,
    author        = {Jenner, Erik and Kapur, Shreyas and Georgiev, Vasil and Allen, Cameron and Emmons, Scott and Russell, Stuart},
    title         = {Evidence of Learned Look-Ahead in a Chess-Playing Neural Network},
    year          = {2024},
    eprint        = {2406.00877},
    archivePrefix = {arXiv},
    primaryClass  = {cs.LG},
    url           = {https://arxiv.org/abs/2406.00877},
  }

Language Models Represent Beliefs of Self and Others

  @misc{zhu2024languagemodelsrepresentbeliefs,
    author        = {Zhu, Wentao and Zhang, Zhining and Wang, Yizhou},
    title         = {Language Models Represent Beliefs of Self and Others},
    year          = {2024},
    eprint        = {2402.18496},
    archivePrefix = {arXiv},
    primaryClass  = {cs.AI},
    url           = {https://arxiv.org/abs/2402.18496},
  }

Research Referencing NDIF

The Quest for the Right Mediator: A History, Survey, and Theoretical Grounding of Causal Interpretability

  @misc{mueller2024questrightmediatorhistory,
    author        = {Mueller, Aaron and Brinkmann, Jannik and Li, Millicent and Marks, Samuel and Pal, Koyena and Prakash, Nikhil and Rager, Can and Sankaranarayanan, Aruna and Sharma, Arnab Sen and Sun, Jiuding and Todd, Eric and Bau, David and Belinkov, Yonatan},
    title         = {The Quest for the Right Mediator: A History, Survey, and Theoretical Grounding of Causal Interpretability},
    year          = {2024},
    eprint        = {2408.01416},
    archivePrefix = {arXiv},
    primaryClass  = {cs.LG},
    url           = {https://arxiv.org/abs/2408.01416},
  }

Penzai + Treescope: A Toolkit for Interpreting, Visualizing, and Editing Models As Data

  @misc{johnson2024penzaitreescopetoolkit,
    title         = {{Penzai} + {Treescope}: A Toolkit for Interpreting, Visualizing, and Editing Models As Data},
    author        = {Johnson, Daniel D.},
    year          = {2024},
    eprint        = {2408.00211},
    archivePrefix = {arXiv},
    primaryClass  = {cs.LG},
    url           = {https://arxiv.org/abs/2408.00211},
  }

Comgra: A Tool for Analyzing and Debugging Neural Networks

  @misc{dietz2024comgratoolanalyzingdebugging,
    author        = {Dietz, Florian and Fellenz, Sophie and Klakow, Dietrich and Kloft, Marius},
    title         = {Comgra: A Tool for Analyzing and Debugging Neural Networks},
    year          = {2024},
    eprint        = {2407.21656},
    archivePrefix = {arXiv},
    primaryClass  = {cs.LG},
    url           = {https://arxiv.org/abs/2407.21656},
  }

Black-Box Access is Insufficient for Rigorous AI Audits

  @inproceedings{10.1145/3630106.3659037,
    author    = {Casper, Stephen and Ezell, Carson and Siegmann, Charlotte and Kolt, Noam and Curtis, Taylor Lynn and Bucknall, Benjamin and Haupt, Andreas and Wei, Kevin and Scheurer, Jérémy and Hobbhahn, Marius and Sharkey, Lee and Krishna, Satyapriya and Von Hagen, Marvin and Alberti, Silas and Chan, Alan and Sun, Qinyi and Gerovitch, Michael and Bau, David and Tegmark, Max and Krueger, David and Hadfield-Menell, Dylan},
    title     = {Black-Box Access is Insufficient for Rigorous {AI} Audits},
    booktitle = {Proceedings of the 2024 ACM Conference on Fairness, Accountability, and Transparency},
    series    = {FAccT '24},
    year      = {2024},
    pages     = {2254--2272},
    numpages  = {19},
    publisher = {Association for Computing Machinery},
    address   = {New York, NY, USA},
    isbn      = {9798400704505},
    doi       = {10.1145/3630106.3659037},
    url       = {https://doi.org/10.1145/3630106.3659037},
    keywords  = {Adversarial Attacks, Auditing, Black-Box Access, Evaluation, Explainability, Fairness, Fine-Tuning, Governance, Interpretability, Policy, Regulation, Risk, White-Box Access},
    location  = {Rio de Janeiro, Brazil},
  }
    

A Primer on the Inner Workings of Transformer-based Language Models

  @misc{ferrando2024primerinnerworkingstransformerbased,
    title         = {A Primer on the Inner Workings of {Transformer-based} Language Models},
    author        = {Ferrando, Javier and Sarti, Gabriele and Bisazza, Arianna and Costa-jussà, Marta R.},
    year          = {2024},
    eprint        = {2405.00208},
    archivePrefix = {arXiv},
    primaryClass  = {cs.CL},
    url           = {https://arxiv.org/abs/2405.00208},
  }

What needs to go right for an induction head? A mechanistic study of in-context learning circuits and their formation

  @misc{singh2024needsrightinductionhead,
    author        = {Singh, Aaditya K. and Moskovitz, Ted and Hill, Felix and Chan, Stephanie C. Y. and Saxe, Andrew M.},
    title         = {What needs to go right for an induction head? A mechanistic study of in-context learning circuits and their formation},
    year          = {2024},
    eprint        = {2404.07129},
    archivePrefix = {arXiv},
    primaryClass  = {cs.LG},
    url           = {https://arxiv.org/abs/2404.07129},
  }

Pyvene: A Library for Understanding and Improving PyTorch Models via Interventions

  @misc{wu2024pyvenelibraryunderstandingimproving,
    title         = {{pyvene}: A Library for Understanding and Improving {PyTorch} Models via Interventions},
    author        = {Wu, Zhengxuan and Geiger, Atticus and Arora, Aryaman and Huang, Jing and Wang, Zheng and Goodman, Noah D. and Manning, Christopher D. and Potts, Christopher},
    year          = {2024},
    eprint        = {2403.07809},
    archivePrefix = {arXiv},
    primaryClass  = {cs.LG},
    url           = {https://arxiv.org/abs/2403.07809},
  }