Publications | Zaifeng Pan

2026

preprint

Lookahead Context Engineering: Hiding Context Transformation Overhead for Efficient Long-Horizon Agent Serving

Zaifeng Pan, Qianxu Wang, Zhengding Hu, Chang Chen, Yue Guan, Steven Swanson, and Yufei Ding

2026

% Preprint hosted on the author's site; the PDF link is carried in howpublished.
@misc{smoothagent,
  title        = {Lookahead Context Engineering: Hiding Context Transformation Overhead for Efficient Long-Horizon Agent Serving},
  author       = {Pan, Zaifeng and Wang, Qianxu and Hu, Zhengding and Chen, Chang and Guan, Yue and Swanson, Steven and Ding, Yufei},
  year         = {2026},
  howpublished = {\url{https://panzaifeng.github.io/assets/pdf/smoothagent.pdf}}
}

preprint

ChipBench: A Next-Step Benchmark for Evaluating LLM Performance in AI-Aided Chip Design

Zhongkai Yu, Chenyang Zhou, Yichen Lin, Hejia Zhang, Haotian Ye, Junxia Cui, Zaifeng Pan, Jishen Zhao, and Yufei Ding

arXiv preprint arXiv:2601.21448, 2026

Bib

% arXiv preprint: the identifier is stored in the eprint fields instead of a journal name.
% Braced words keep their capitals under sentence-casing bibliography styles.
@misc{yu2026chipbench,
  author        = {Yu, Zhongkai and Zhou, Chenyang and Lin, Yichen and Zhang, Hejia and Ye, Haotian and Cui, Junxia and Pan, Zaifeng and Zhao, Jishen and Ding, Yufei},
  title         = {{ChipBench}: A Next-Step Benchmark for Evaluating {LLM} Performance in {AI}-Aided Chip Design},
  year          = {2026},
  eprint        = {2601.21448},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2601.21448}
}

preprint

Pancake: Hierarchical Memory System for Multi-Agent LLM Serving

Zhengding Hu, Zaifeng Pan, Prabhleen Kaur, Vibha Murthy, Zhongkai Yu, Yue Guan, Zhen Wang, Steven Swanson, and Yufei Ding

arXiv preprint arXiv:2602.21477, 2026

Bib

% arXiv preprint: the identifier is stored in the eprint fields instead of a journal name.
% Braced words keep their capitals under sentence-casing bibliography styles.
@misc{hu2026pancake,
  author        = {Hu, Zhengding and Pan, Zaifeng and Kaur, Prabhleen and Murthy, Vibha and Yu, Zhongkai and Guan, Yue and Wang, Zhen and Swanson, Steven and Ding, Yufei},
  title         = {Pancake: Hierarchical Memory System for Multi-Agent {LLM} Serving},
  year          = {2026},
  eprint        = {2602.21477},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2602.21477}
}

ICML’26

ScaleSim: Serving Large-Scale Multi-Agent Simulation with Invocation Distance-Based Memory Management

Zaifeng Pan, Yipeng Shen, Zhengding Hu, Zhuang Wang, Aninda Manocha, Zheng Wang, Zhongkai Yu, Yue Guan, and Yufei Ding

arXiv preprint arXiv:2601.21473, 2026

Bib PDF Code

% arXiv preprint: the identifier is stored in the eprint fields instead of a journal name.
% Braced words keep their capitals under sentence-casing bibliography styles.
@misc{pan2026scalesim,
  author        = {Pan, Zaifeng and Shen, Yipeng and Hu, Zhengding and Wang, Zhuang and Manocha, Aninda and Wang, Zheng and Yu, Zhongkai and Guan, Yue and Ding, Yufei},
  title         = {{ScaleSim}: Serving Large-Scale Multi-Agent Simulation with Invocation Distance-Based Memory Management},
  year          = {2026},
  eprint        = {2601.21473},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2601.21473}
}

2025

NeurIPS’25

KVFlow: Efficient Prefix Caching for Accelerating LLM-Based Multi-Agent Workflows

Zaifeng Pan, Ajjkumar Patel, Yipeng Shen, Zhengding Hu, Yue Guan, Wan-Lu Li, Lianhui Qin, Yida Wang, and Yufei Ding

In The Thirty-ninth Annual Conference on Neural Information Processing Systems, 2025

Bib PDF Code

% Braced words keep their capitals under sentence-casing bibliography styles.
@inproceedings{pan2025kvflow,
  author    = {Pan, Zaifeng and Patel, Ajjkumar and Shen, Yipeng and Hu, Zhengding and Guan, Yue and Li, Wan-Lu and Qin, Lianhui and Wang, Yida and Ding, Yufei},
  title     = {{KVFlow}: Efficient Prefix Caching for Accelerating {LLM}-Based Multi-Agent Workflows},
  booktitle = {The Thirty-ninth Annual Conference on Neural Information Processing Systems},
  year      = {2025}
}

NeurIPS’25

Yggdrasil: Bridging Dynamic Speculation and Static Runtime for Latency-Optimal Tree-Based LLM Decoding

Yue Guan, Changming Yu, Shihan Fang, Weiming Hu, Zaifeng Pan, Zheng Wang, Zihan Liu, Yangjie Zhou, Yufei Ding, Minyi Guo, and Jingwen Leng

In The Thirty-ninth Annual Conference on Neural Information Processing Systems, 2025

Bib PDF

% Braced words keep their capitals under sentence-casing bibliography styles.
@inproceedings{guan2025yggdrasil,
  author    = {Guan, Yue and Yu, Changming and Fang, Shihan and Hu, Weiming and Pan, Zaifeng and Wang, Zheng and Liu, Zihan and Zhou, Yangjie and Ding, Yufei and Guo, Minyi and Leng, Jingwen},
  title     = {Yggdrasil: Bridging Dynamic Speculation and Static Runtime for Latency-Optimal Tree-Based {LLM} Decoding},
  booktitle = {The Thirty-ninth Annual Conference on Neural Information Processing Systems},
  year      = {2025}
}

SOSP’25

Mercury: Unlocking Multi-GPU Operator Optimization for Large Language Models via Remote Memory Scheduling

Yue Guan, Xinwei Qiang, Zaifeng Pan, Daniels Johnson, Yuanwei Fang, Keren Zhou, Yuke Wang, Wanlu Li, Yufei Ding, and Adnan Aziz

In Proceedings of the 31st Symposium on Operating Systems Principles, 2025

Bib PDF

% Braced words keep their capitals under sentence-casing bibliography styles.
% The proceedings name uses the correct ordinal and proper-noun capitalisation.
@inproceedings{guan2025mercury,
  author    = {Guan, Yue and Qiang, Xinwei and Pan, Zaifeng and Johnson, Daniels and Fang, Yuanwei and Zhou, Keren and Wang, Yuke and Li, Wanlu and Ding, Yufei and Aziz, Adnan},
  title     = {Mercury: Unlocking Multi-{GPU} Operator Optimization for Large Language Models via Remote Memory Scheduling},
  booktitle = {Proceedings of the 31st Symposium on Operating Systems Principles},
  year      = {2025}
}

SOSP’25

HedraRAG: Co-Optimizing Generation and Retrieval for Heterogeneous Retrieval-Augmented Generation Workflows

Zhengding Hu, Vibha Murthy, Zaifeng Pan, Wanlu Li, Xiaoyi Fang, Yufei Ding, and Yuke Wang

In Proceedings of the 31st Symposium on Operating Systems Principles, 2025

Bib PDF

% Braced words keep their capitals under sentence-casing bibliography styles.
% The proceedings name uses the correct ordinal and proper-noun capitalisation.
@inproceedings{hu2025hedrarag,
  author    = {Hu, Zhengding and Murthy, Vibha and Pan, Zaifeng and Li, Wanlu and Fang, Xiaoyi and Ding, Yufei and Wang, Yuke},
  title     = {{HedraRAG}: Co-Optimizing Generation and Retrieval for Heterogeneous Retrieval-Augmented Generation Workflows},
  booktitle = {Proceedings of the 31st Symposium on Operating Systems Principles},
  year      = {2025}
}

OSDI’25

WLB-LLM: Workload-Balanced 4D Parallelism for Large Language Model Training

Zheng Wang, Anna Cai, Xinfeng Xie, Zaifeng Pan, Yue Guan, Weiwei Chu, Jie Wang, Shikai Li, Jianyu Huang, Chris Cai, Yuchen Hao, and Yufei Ding

In 19th USENIX Symposium on Operating Systems Design and Implementation, 2025

Bib PDF

% Braced words keep their capitals under sentence-casing bibliography styles.
@inproceedings{wang2025wlb,
  author    = {Wang, Zheng and Cai, Anna and Xie, Xinfeng and Pan, Zaifeng and Guan, Yue and Chu, Weiwei and Wang, Jie and Li, Shikai and Huang, Jianyu and Cai, Chris and Hao, Yuchen and Ding, Yufei},
  title     = {{WLB-LLM}: Workload-Balanced {4D} Parallelism for Large Language Model Training},
  booktitle = {19th USENIX Symposium on Operating Systems Design and Implementation},
  year      = {2025}
}

USENIX ATC’25

PluS: Highly Efficient and Expandable ML Compiler with Pluggable Graph Schedules

Ruofan Wu, Zhen Zheng, Feng Zhang, Chuanjie Liu, Zaifeng Pan, Jidong Zhai, and Xiaoyong Du

In USENIX Annual Technical Conference, 2025

Bib PDF

% Braced words keep their capitals under sentence-casing bibliography styles.
@inproceedings{wu2025plus,
  author    = {Wu, Ruofan and Zheng, Zhen and Zhang, Feng and Liu, Chuanjie and Pan, Zaifeng and Zhai, Jidong and Du, Xiaoyong},
  title     = {{PluS}: Highly Efficient and Expandable {ML} Compiler with Pluggable Graph Schedules},
  booktitle = {USENIX Annual Technical Conference},
  year      = {2025}
}

MLSys’25

FastTree: Optimizing Attention Kernel and Runtime for Tree-Structured LLM Inference

Zaifeng Pan, Yitong Ding, Yue Guan, Zheng Wang, Zhongkai Yu, Xulong Tang, Yida Wang, and Yufei Ding

In Proceedings of Machine Learning and Systems, 2025

Bib PDF Code

% Braced words keep their capitals under sentence-casing bibliography styles.
@inproceedings{pan2025fasttree,
  author    = {Pan, Zaifeng and Ding, Yitong and Guan, Yue and Wang, Zheng and Yu, Zhongkai and Tang, Xulong and Wang, Yida and Ding, Yufei},
  title     = {{FastTree}: Optimizing Attention Kernel and Runtime for Tree-Structured {LLM} Inference},
  booktitle = {Proceedings of Machine Learning and Systems},
  year      = {2025}
}

2024

SC’24

RecFlex: Enabling Feature Heterogeneity-Aware Optimization for Deep Recommendation Models with Flexible Schedules

Zaifeng Pan, Zhen Zheng, Feng Zhang, Bing Xie, Ruofan Wu, Shaden Smith, Chuanjie Liu, Olatunji Ruwase, Xiaoyong Du, and Yufei Ding

In International Conference for High Performance Computing, Networking, Storage and Analysis, 2024

Bib PDF Code

% Braced words keep their capitals under sentence-casing bibliography styles.
@inproceedings{pan2024recflex,
  author       = {Pan, Zaifeng and Zheng, Zhen and Zhang, Feng and Xie, Bing and Wu, Ruofan and Smith, Shaden and Liu, Chuanjie and Ruwase, Olatunji and Du, Xiaoyong and Ding, Yufei},
  title        = {{RecFlex}: Enabling Feature Heterogeneity-Aware Optimization for Deep Recommendation Models with Flexible Schedules},
  booktitle    = {International Conference for High Performance Computing, Networking, Storage and Analysis},
  pages        = {1--15},
  year         = {2024},
  organization = {IEEE}
}

2023

SIGMOD’24

BladeDISC: Optimizing dynamic shape machine learning workloads via compiler approach

Zhen Zheng, Zaifeng Pan, Dalin Wang, Kai Zhu, Wenyi Zhao, Tianyou Guo, Xiafei Qiu, Minmin Sun, Junjie Bai, Feng Zhang, Xiaoyong Du, Jidong Zhai, and Wei Lin

Proceedings of the ACM on Management of Data, 2023

Bib PDF Code

% Braced words keep their capitals under sentence-casing bibliography styles.
@article{zheng2023bladedisc,
  author    = {Zheng, Zhen and Pan, Zaifeng and Wang, Dalin and Zhu, Kai and Zhao, Wenyi and Guo, Tianyou and Qiu, Xiafei and Sun, Minmin and Bai, Junjie and Zhang, Feng and Du, Xiaoyong and Zhai, Jidong and Lin, Wei},
  title     = {{BladeDISC}: Optimizing dynamic shape machine learning workloads via compiler approach},
  journal   = {Proceedings of the ACM on Management of Data},
  volume    = {1},
  number    = {3},
  pages     = {1--29},
  year      = {2023},
  publisher = {ACM New York, NY, USA}
}

ASPLOS’23

RECom: A Compiler Approach to Accelerating Recommendation Model Inference with Massive Embedding Columns

Zaifeng Pan, Zhen Zheng, Feng Zhang, Ruofan Wu, Hao Liang, Dalin Wang, Xiafei Qiu, Junjie Bai, Wei Lin, and Xiaoyong Du

In Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 4, 2023

🏆 Distinguished Artifact Award (presented at ASPLOS’24)

Bib PDF Code

% Braced words keep their capitals under sentence-casing bibliography styles.
@inproceedings{pan2023recom,
  author    = {Pan, Zaifeng and Zheng, Zhen and Zhang, Feng and Wu, Ruofan and Liang, Hao and Wang, Dalin and Qiu, Xiafei and Bai, Junjie and Lin, Wei and Du, Xiaoyong},
  title     = {{RECom}: A Compiler Approach to Accelerating Recommendation Model Inference with Massive Embedding Columns},
  booktitle = {Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 4},
  pages     = {268--286},
  year      = {2023}
}

2021

TPDS

G-SLIDE: A GPU-based sub-linear deep learning engine via LSH sparsification

Zaifeng Pan, Feng Zhang, Hourun Li, Chenyang Zhang, Xiaoyong Du, and Dong Deng

IEEE Transactions on Parallel and Distributed Systems, 2021

Bib PDF Code

% The system name and the acronyms GPU and LSH are restored to upper case and braced
% so that sentence-casing bibliography styles do not lowercase them again.
@article{pan2021g,
  author    = {Pan, Zaifeng and Zhang, Feng and Li, Hourun and Zhang, Chenyang and Du, Xiaoyong and Deng, Dong},
  title     = {{G-SLIDE}: A {GPU}-based sub-linear deep learning engine via {LSH} sparsification},
  journal   = {IEEE Transactions on Parallel and Distributed Systems},
  volume    = {33},
  number    = {11},
  pages     = {3015--3027},
  year      = {2021},
  publisher = {IEEE}
}

TPDS

Exploring data analytics without decompression on embedded GPU systems

Zaifeng Pan, Feng Zhang, Yanliang Zhou, Jidong Zhai, Xipeng Shen, Onur Mutlu, and Xiaoyong Du

IEEE Transactions on Parallel and Distributed Systems, 2021

Bib PDF

% Braced words keep their capitals under sentence-casing bibliography styles.
@article{pan2021exploring,
  author    = {Pan, Zaifeng and Zhang, Feng and Zhou, Yanliang and Zhai, Jidong and Shen, Xipeng and Mutlu, Onur and Du, Xiaoyong},
  title     = {Exploring data analytics without decompression on embedded {GPU} systems},
  journal   = {IEEE Transactions on Parallel and Distributed Systems},
  volume    = {33},
  number    = {7},
  pages     = {1553--1568},
  year      = {2021},
  publisher = {IEEE}
}

ICDE’21

G-TADOC: Enabling efficient GPU-based text analytics without decompression

Feng Zhang, Zaifeng Pan, Yanliang Zhou, Jidong Zhai, Xipeng Shen, Onur Mutlu, and Xiaoyong Du

In 2021 IEEE 37th International Conference on Data Engineering (ICDE), 2021

Bib PDF

% Braced words keep their capitals under sentence-casing bibliography styles.
@inproceedings{zhang2021g,
  author       = {Zhang, Feng and Pan, Zaifeng and Zhou, Yanliang and Zhai, Jidong and Shen, Xipeng and Mutlu, Onur and Du, Xiaoyong},
  title        = {{G-TADOC}: Enabling efficient {GPU}-based text analytics without decompression},
  booktitle    = {2021 IEEE 37th International Conference on Data Engineering (ICDE)},
  pages        = {1679--1690},
  year         = {2021},
  organization = {IEEE}
}