{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T06:36:18Z","timestamp":1762929378952,"version":"3.45.0"},"reference-count":88,"publisher":"Springer Science and Business Media LLC","issue":"11","license":[{"start":{"date-parts":[[2025,8,28]],"date-time":"2025-08-28T00:00:00Z","timestamp":1756339200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/linproxy.fan.workers.dev:443\/https\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,8,28]],"date-time":"2025-08-28T00:00:00Z","timestamp":1756339200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/linproxy.fan.workers.dev:443\/https\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,11]]},"DOI":"10.1007\/s11263-025-02561-w","type":"journal-article","created":{"date-parts":[[2025,8,28]],"date-time":"2025-08-28T15:00:09Z","timestamp":1756393209000},"page":"8059-8077","update-policy":"https:\/\/linproxy.fan.workers.dev:443\/https\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Compressing Vision Transformer from the View of Model Property in Frequency Domain"],"prefix":"10.1007","volume":"133","author":[{"ORCID":"https:\/\/linproxy.fan.workers.dev:443\/https\/orcid.org\/0000-0002-4259-3073","authenticated-orcid":false,"given":"Zhenyu","family":"Wang","sequence":"first","affiliation":[]},{"given":"Xuemei","family":"Xie","sequence":"additional","affiliation":[]},{"given":"Hao","family":"Luo","sequence":"additional","affiliation":[]},{"given":"Tao","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Weisheng","family":"Dong","sequence":"additional","affiliation":[]},{"given":"Kai","family":"Xiong","sequence":"additional","affiliation":[]},{"given":"Yongxu","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Xuyang","family":"Li","sequence":"additional","affiliation":[]},{"given":"Fan","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Guangming","family":"Shi","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,8,28]]},"reference":[{"key":"2561_CR1","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., & Gelly, S., et al. (2020). An image is worth 16x16 words: Transformers for image recognition at scale. In International Conference on Learning Representations."},{"key":"2561_CR2","unstructured":"Touvron, H., Cord, M., Douze, M., Massa, F., Sablayrolles, A., & J\u00e9gou, H. (2021). Training data-efficient image transformers & distillation through attention. In International Conference on Machine Learning, pp. (10347\u201310357). PMLR."},{"key":"2561_CR3","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., & Guo, B. (2021). Swin transformer: Hierarchical vision transformer using shifted windows. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. (10012\u201310022).","DOI":"10.1109\/ICCV48922.2021.00986"},{"issue":"5","key":"2561_CR4","first-page":"6575","volume":"45","author":"L Yuan","year":"2022","unstructured":"Yuan, L., Hou, Q., Jiang, Z., Feng, J., & Yan, S. (2022). 
Volo: Vision outlooker for visual recognition. IEEE transactions on pattern analysis and machine intelligence, 45(5), 6575\u20136586.","journal-title":"IEEE transactions on pattern analysis and machine intelligence"},{"key":"2561_CR5","unstructured":"Yu, S., Chen, T., Shen, J., Yuan, H., Tan, J., Yang, S., Liu, J., & Wang, Z. (2021). Unified visual transformer compression. In International Conference on Learning Representations."},{"key":"2561_CR6","doi-asserted-by":"crossref","unstructured":"Yang, H., Yin, H., Shen, M., Molchanov, P., Li, H., & Kautz, J. (2023). Global vision transformer pruning with hessian-aware saliency. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. (18547\u201318557).","DOI":"10.1109\/CVPR52729.2023.01779"},{"key":"2561_CR7","unstructured":"Chen, T., Cheng, Y., Gan, Z., Yuan, L., Zhang, L., & Wang, Z. (2021). Chasing sparsity in vision transformers: An end-to-end exploration. Advances in Neural Information Processing Systems 34."},{"key":"2561_CR8","doi-asserted-by":"crossref","unstructured":"Tang, Y., Han, K., Wang, Y., Xu, C., Guo, J., Xu, C., & Tao, D. (2022). Patch slimming for efficient vision transformers. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. (2165\u201312174).","DOI":"10.1109\/CVPR52688.2022.01185"},{"key":"2561_CR9","unstructured":"Liang, Y., Ge, C., Tong, Z., Song, Y., & Xie, P., et al. (2022).: Not all patches are what you need: Expediting vision transformers via token reorganizations. In International Conference on Learning Representations (ICLR)."},{"key":"2561_CR10","unstructured":"Rao, Y., Zhao, W., Liu, B., Lu, J., Zhou, J., & Hsieh, C.-J. (2021). Dynamicvit: Efficient vision transformers with dynamic token sparsification. Advances in neural information processing systems 34."},{"key":"2561_CR11","unstructured":"Song, Z., Xu, Y., He, Z., Jiang, L., Jing, N., & Liang, X. (2022). Cp-vit: Cascade vision transformer pruning via progressive sparsity prediction. arXiv preprint arXiv:2203.04570."},{"key":"2561_CR12","unstructured":"Pan, B., Panda, R., Jiang, Y., Wang, Z., Feris, R., & Oliva, A. (2021). Ia-$$\\text{red}^2$$: Interpretability-aware redundancy reduction for vision transformers. Advances in Neural Information Processing Systems 34"},{"key":"2561_CR13","doi-asserted-by":"crossref","unstructured":"Molchanov, P., Mallya, A., Tyree, S., Frosio, I., & Kautz, J. (2019). Importance estimation for neural network pruning. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. (11264\u201311272).","DOI":"10.1109\/CVPR.2019.01152"},{"key":"2561_CR14","doi-asserted-by":"crossref","unstructured":"Bai, J., Yuan, L., Xia, S.-T., Yan, S., Li, Z., & Liu, W. (2022). Improving vision transformers by revisiting high-frequency components. In European Conference on Computer Vision, pp. 1\u201318. Springer.","DOI":"10.1007\/978-3-031-20053-3_1"},{"key":"2561_CR15","unstructured":"Wang, P., Zheng, W., Chen, T., & Wang, Z. (2021). Anti-oversmoothing in deep vision transformers via the fourier domain analysis: From theory to practice. In International Conference on Learning Representations."},{"key":"2561_CR16","unstructured":"Rao, Y., Zhao, W., Zhu, Z., Lu, J., & Zhou, J. (2021). Global filter networks for image classification. Advances in Neural Information Processing Systems,34."},{"key":"2561_CR17","unstructured":"Park, N., & Kim, S. (2021). How do vision transformers work? 
In International Conference on Learning Representations ."},{"key":"2561_CR18","doi-asserted-by":"crossref","unstructured":"Wang, H., Wu, X., Huang, Z., & Xing, E.P. (2020). High-frequency component helps explain the generalization of convolutional neural networks. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. (8684\u20138694).","DOI":"10.1109\/CVPR42600.2020.00871"},{"key":"2561_CR19","unstructured":"Wang, Z., Luo, H., WANG, P., Ding, F., Wang, F., & Li, H. (2022). VTC-LFC: Vision transformer compression with low-frequency components. In Oh, A.H., Agarwal, A., Belgrave, D., Cho, K. (eds.) Advances in Neural Information Processing Systems. https:\/\/linproxy.fan.workers.dev:443\/https\/openreview.net\/forum?id=HuiLIB6EaOk"},{"key":"2561_CR20","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., & Polosukhin, I. (2017). Attention is all you need. Advances in neural information processing systems 30"},{"key":"2561_CR21","doi-asserted-by":"crossref","unstructured":"Guo, J., Han, K., Wu, H., Tang, Y., Chen, X., Wang, Y., & Xu, C. (2022). Cmt: Convolutional neural networks meet vision transformers. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. (12175\u201312185).","DOI":"10.1109\/CVPR52688.2022.01186"},{"key":"2561_CR22","doi-asserted-by":"crossref","unstructured":"Chen, C.-F.R., Fan, Q., & Panda, R. (2021). Crossvit: Cross-attention multi-scale vision transformer for image classification. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. (357\u2013366).","DOI":"10.1109\/ICCV48922.2021.00041"},{"key":"2561_CR23","unstructured":"Ali, A., Touvron, H., Caron, M., Bojanowski, P., Douze, M., Joulin, A., Laptev, I., Neverova, N., Synnaeve, G., & Verbeek, J., et al. (2021). Xcit: Cross-covariance image transformers. Advances in neural information processing systems 34."},{"key":"2561_CR24","doi-asserted-by":"crossref","unstructured":"Graham, B., El-Nouby, A., Touvron, H., Stock, P., Joulin, A., J\u00e9gou, H., & Douze, M. (2021). Levit: a vision transformer in convnet\u2019s clothing for faster inference. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. (12259\u201312269).","DOI":"10.1109\/ICCV48922.2021.01204"},{"key":"2561_CR25","doi-asserted-by":"crossref","unstructured":"Wang, P., Wang, X., Wang, F., Lin, M., Chang, S., Li, H., & Jin, R. (2022). Kvt: k-nn attention for boosting vision transformers. In European Conference on Computer Vision, pp. (285\u2013302). Springer.","DOI":"10.1007\/978-3-031-20053-3_17"},{"key":"2561_CR26","unstructured":"Jiang, Z.-H., Hou, Q., Yuan, L., Zhou, D., Shi, Y., Jin, X., Wang, A., & Feng, J. (2021). All tokens matter: Token labeling for training better vision transformers. Advances in Neural Information Processing Systems 34."},{"key":"2561_CR27","unstructured":"Cheng, B., Schwing, A., & Kirillov, A. (2021). Per-pixel classification is not all you need for semantic segmentation. Advances in Neural Information Processing Systems 34."},{"key":"2561_CR28","doi-asserted-by":"crossref","unstructured":"Wang, Y., Xu, Z., Wang, X., Shen, C., Cheng, B., Shen, H., & Xia, H. (2021). End-to-end video instance segmentation with transformers. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
(8741\u20138750)","DOI":"10.1109\/CVPR46437.2021.00863"},{"key":"2561_CR29","doi-asserted-by":"crossref","unstructured":"Ding, L., Lin, D., Lin, S., Zhang, J., Cui, X., Wang, Y., Tang, H., & Bruzzone, L. (2022). Looking outside the window: Wide-context transformer for the semantic segmentation of high-resolution remote sensing images. IEEE Transactions on Geoscience and Remote Sensing","DOI":"10.1109\/TGRS.2022.3168697"},{"key":"2561_CR30","doi-asserted-by":"crossref","unstructured":"He, S., Luo, H., Wang, P., Wang, F., Li, H., & Jiang, W. (2021). Transreid: Transformer-based object re-identification. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. (15013\u201315022)","DOI":"10.1109\/ICCV48922.2021.01474"},{"key":"2561_CR31","doi-asserted-by":"crossref","unstructured":"Amini, A., Periyasamy, A.S., & Behnke, S. (2021). T6d-direct: Transformers for multi-object 6d pose direct regression. In DAGM German Conference on Pattern Recognition, pp. (530\u2013544). Springer","DOI":"10.1007\/978-3-030-92659-5_34"},{"key":"2561_CR32","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., & Dai, J. (2020). Deformable detr: Deformable transformers for end-to-end object detection. In International Conference on Learning Representations."},{"key":"2561_CR33","unstructured":"Lan, Z., Chen, M., Goodman, S., Gimpel, K., Sharma, P., & Soricut, R. (2019). Albert: A lite bert for self-supervised learning of language representations. In International Conference on Learning Representations."},{"key":"2561_CR34","doi-asserted-by":"crossref","unstructured":"Zhang, J., Peng, H., Wu, K., Liu, M., Xiao, B., Fu, J., & Yuan, L. (2022). Minivit: Compressing vision transformers with weight multiplexing. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. (12145\u201312154).","DOI":"10.1109\/CVPR52688.2022.01183"},{"key":"2561_CR35","doi-asserted-by":"publisher","unstructured":"Xu, Y., Zhang, Z., Zhang, M., Sheng, K., Li, K., Dong, W., Zhang, L., Xu, C., & Sun, X. (2022). Evo-vit: Slow-fast token evolution for dynamic vision transformer. Proceedings of the AAAI Conference on Artificial Intelligence, 2964\u20132972 https:\/\/linproxy.fan.workers.dev:443\/https\/doi.org\/10.1609\/aaai.v36i3.20202","DOI":"10.1609\/aaai.v36i3.20202"},{"key":"2561_CR36","first-page":"18330","volume":"33","author":"W Zhou","year":"2020","unstructured":"Zhou, W., Xu, C., Ge, T., McAuley, J., Xu, K., & Wei, F. (2020). Bert loses patience: Fast and robust inference with early exit. Advances in Neural Information Processing Systems, 33, 18330\u201318341.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2561_CR37","doi-asserted-by":"crossref","unstructured":"Schwartz, R., Stanovsky, G., Swayamdipta, S., Dodge, J., & Smith, N.A. (2020). The right tool for the job: Matching model and instance complexities. In Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. (6640\u20136651)","DOI":"10.18653\/v1\/2020.acl-main.593"},{"key":"2561_CR38","doi-asserted-by":"crossref","unstructured":"Wu, K., Zhang, J., Peng, H., Liu, M., Xiao, B., Fu, J., & Yuan, L. (2022). Tinyvit: Fast pretraining distillation for small vision transformers. In European Conference on Computer Vision, pp. 68\u201385. Springer.","DOI":"10.1007\/978-3-031-19803-8_5"},{"key":"2561_CR39","doi-asserted-by":"crossref","unstructured":"Molchanov, P., Mallya, A., Tyree, S., Frosio, I., & Kautz, J. (2020). Importance estimation for neural network pruning. 
In 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2019.01152"},{"key":"2561_CR40","doi-asserted-by":"crossref","unstructured":"He, Y., Liu, P., Wang, Z., Hu, Z., & Yang, Y. (2020). Filter pruning via geometric median for deep convolutional neural networks acceleration. In 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR42600.2020.00208"},{"key":"2561_CR41","doi-asserted-by":"crossref","unstructured":"Lin, M., Ji, R., Wang, Y., Zhang, Y., Zhang, B., Tian, Y., & Shao, L. (2020). Hrank: Filter pruning using high-rank feature map. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. (1529\u20131538).","DOI":"10.1109\/CVPR42600.2020.00160"},{"key":"2561_CR42","doi-asserted-by":"crossref","unstructured":"Liu, N., Ma, X., Xu, Z., Wang, Y., Tang, J., & Ye, J. (2020). Autocompress: An automatic dnn structured pruning framework for ultra-high compression rates. In Proceedings of the AAAI Conference on Artificial Intelligence, vol. 34, pp. (4876\u20134883).","DOI":"10.1609\/aaai.v34i04.5924"},{"key":"2561_CR43","doi-asserted-by":"crossref","unstructured":"Chavan, A., Shen, Z., Liu, Z., Liu, Z., Cheng, K.-T., & Xing, E.P. (2022). Vision transformer slimming: Multi-dimension searching in continuous optimization space. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. (4931\u20134941).","DOI":"10.1109\/CVPR52688.2022.00488"},{"key":"2561_CR44","doi-asserted-by":"crossref","unstructured":"Yu, F., Huang, K., Wang, M., Cheng, Y., Chu, W., & Cui, L. (2022). Width & depth pruning for vision transformers. In Proceedings of the AAAI Conference on Artificial Intelligence, vol. 36, pp. (3143\u20133151).","DOI":"10.1609\/aaai.v36i3.20222"},{"key":"2561_CR45","doi-asserted-by":"crossref","unstructured":"Yin, M., Uzkent, B., Shen, Y., Jin, H., & Yuan, B. (2023). Gohsp: A unified framework of graph and optimization-based heterogeneous structured pruning for vision transformer. In Proceedings of the AAAI Conference on Artificial Intelligence, vol. 37, pp. (10954\u201310962)","DOI":"10.1609\/aaai.v37i9.26298"},{"key":"2561_CR46","doi-asserted-by":"crossref","unstructured":"Tang, S., Lin, W., Ye, H., Ye, P., Yu, C., Li, B., & Chen, T. (2024). Enhanced sparsification via stimulative training. In European Conference on Computer Vision, pp. (18\u201336). Springer","DOI":"10.1007\/978-3-031-72983-6_2"},{"key":"2561_CR47","doi-asserted-by":"crossref","unstructured":"Liu, X., Wu, T., & Guo, G. (2023). Adaptive sparse vit: towards learnable adaptive token pruning by fully exploiting self-attention. In Proceedings of the Thirty-Second International Joint Conference on Artificial Intelligence, pp. (1222\u20131230)","DOI":"10.24963\/ijcai.2023\/136"},{"key":"2561_CR48","doi-asserted-by":"crossref","unstructured":"Ye, H., Yu, C., Ye, P., Xia, R., Tang, Y., Lu, J., Chen, T., & Zhang, B. (2024). Once for both: Single stage of importance and sparsity search for vision transformer compression. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. (5578\u20135588).","DOI":"10.1109\/CVPR52733.2024.00533"},{"key":"2561_CR49","doi-asserted-by":"crossref","unstructured":"Zhao, S., Zhu, L., Wang, X., & Yang, Y. (2022). Centerclip: Token clustering for efficient text-video retrieval. In Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 
(970\u2013981).","DOI":"10.1145\/3477495.3531950"},{"key":"2561_CR50","doi-asserted-by":"crossref","unstructured":"Bolya, D., & Hoffman, J. (2023). Token merging for fast stable diffusion. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. (4599\u20134603).","DOI":"10.1109\/CVPRW59228.2023.00484"},{"key":"2561_CR51","doi-asserted-by":"crossref","unstructured":"Kim, M., Gao, S., Hsu, Y.-C., Shen, Y., Jin, H.: Token fusion: Bridging the gap between token pruning and token merging. In Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. (1383\u20131392) (2024)","DOI":"10.1109\/WACV57701.2024.00141"},{"key":"2561_CR52","doi-asserted-by":"crossref","unstructured":"Wei, S., Ye, T., Zhang, S., Tang, Y., & Liang, J. (2023). Joint token pruning and squeezing towards more aggressive compression of vision transformers. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. (2092\u20132101)","DOI":"10.1109\/CVPR52729.2023.00208"},{"key":"2561_CR53","first-page":"54079","volume":"37","author":"DH Lee","year":"2025","unstructured":"Lee, D. H., & Hong, S. (2025). Learning to merge tokens via decoupled embedding for efficient vision transformers. Advances in Neural Information Processing Systems, 37, 54079\u201354104.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2561_CR54","doi-asserted-by":"crossref","unstructured":"Wang, H., Dedhia, B., & Jha, N.K. (2024). Zero-tprune: Zero-shot token pruning through leveraging of the attention graph in pre-trained transformers. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. (16070\u201316079).","DOI":"10.1109\/CVPR52733.2024.01521"},{"key":"2561_CR55","doi-asserted-by":"crossref","unstructured":"Jie, S., Tang, Y., Guo, J., Deng, Z.-H., Han, K., & Wang, Y. (2024). Token compensator: Altering inference cost of vision transformer without re-tuning. In European Conference on Computer Vision, pp. (76\u201394) . Springer","DOI":"10.1007\/978-3-031-72640-8_5"},{"issue":"3","key":"2561_CR56","doi-asserted-by":"publisher","first-page":"264","DOI":"10.1109\/TASSP.1976.1162805","volume":"24","author":"C Rader","year":"1976","unstructured":"Rader, C., & Brenner, N. (1976). A new principle for fast fourier transformation. IEEE Transactions on Acoustics, Speech, and Signal Processing, 24(3), 264\u2013266.","journal-title":"IEEE Transactions on Acoustics, Speech, and Signal Processing"},{"key":"2561_CR57","first-page":"3022","volume":"33","author":"T Dzanic","year":"2020","unstructured":"Dzanic, T., Shah, K., & Witherden, F. (2020). Fourier spectrum discrepancies in deep network generated images. Advances in neural information processing systems, 33, 3022\u20133032.","journal-title":"Advances in neural information processing systems"},{"key":"2561_CR58","doi-asserted-by":"crossref","unstructured":"Durall, R., Keuper, M., & Keuper, J. (2020). Watch your up-convolution: Cnn based generative deep neural networks are failing to reproduce spectral distributions. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. (7890\u20137899).","DOI":"10.1109\/CVPR42600.2020.00791"},{"key":"2561_CR59","doi-asserted-by":"crossref","unstructured":"Chandrasegaran, K., Tran, N.-T., & Cheung, N.-M. (2021). A closer look at fourier spectrum discrepancies for cnn-generated images detection. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
(7200\u20137209).","DOI":"10.1109\/CVPR46437.2021.00712"},{"key":"2561_CR60","first-page":"18126","volume":"34","author":"K Schwarz","year":"2021","unstructured":"Schwarz, K., Liao, Y., & Geiger, A. (2021). On the frequency bias of generative models. Advances in Neural Information Processing Systems, 34, 18126\u201318136.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2561_CR61","doi-asserted-by":"crossref","unstructured":"Kong, Z., Dong, P., Ma, X., Meng, X., Niu, W., Sun, M., Shen, X., Yuan, G., Ren, B., & Tang, H., et al. (2022). Spvit: Enabling faster vision transformers via latency-aware soft token pruning. In European Conference on Computer Vision, pp. (620\u2013640.) Springer","DOI":"10.1007\/978-3-031-20083-0_37"},{"key":"2561_CR62","doi-asserted-by":"crossref","unstructured":"Zhang, X., Zhou, X., Lin, M., & Sun, J. (2018). Shufflenet: An extremely efficient convolutional neural network for mobile devices. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. (6848\u20136856).","DOI":"10.1109\/CVPR.2018.00716"},{"key":"2561_CR63","unstructured":"Bolya, D., Fu, C.-Y., Dai, X., Zhang, P., Feichtenhofer, C., &Hoffman, J. (2023). Token merging: Your vit but faster. In The Eleventh International Conference on Learning Representations."},{"issue":"3","key":"2561_CR64","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., Deng, J., Su, H., Krause, J., Satheesh, S., Ma, S., Huang, Z., Karpathy, A., Khosla, A., Bernstein, M., et al. (2015). Imagenet large scale visual recognition challenge. International journal of computer vision, 115(3), 211\u2013252.","journal-title":"International journal of computer vision"},{"key":"2561_CR65","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., & Zitnick, C.L. (2014). Microsoft coco: Common objects in context. In Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13, pp. (740\u2013755). Springer.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"2561_CR66","doi-asserted-by":"crossref","unstructured":"Zhou, B., Zhao, H., Puig, X., Fidler, S., Barriuso, A., & Torralba, A. (2017). Scene parsing through ade20k dataset. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. (633\u2013641).","DOI":"10.1109\/CVPR.2017.544"},{"key":"2561_CR67","unstructured":"Paszke, A., Gross, S., Chintala, S., Chanan, G., Yang, E., DeVito, Z., Lin, Z., Desmaison, A., Antiga, L., & Lerer, A. (2017). Automatic differentiation in pytorch."},{"key":"2561_CR68","doi-asserted-by":"crossref","unstructured":"Long, S., Zhao, Z., Pi, J., Wang, S., & Wang, J. (2023). Beyond attentive tokens: Incorporating token importance and diversity for efficient vision transformers. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. (10334\u201310343).","DOI":"10.1109\/CVPR52729.2023.00996"},{"key":"2561_CR69","doi-asserted-by":"crossref","unstructured":"He, H., Cai, J., Liu, J., Pan, Z., Zhang, J., Tao, D., & Zhuang, B. (2024). Pruning self-attentions into convolutional layers in single path. 
IEEE Transactions on Pattern Analysis and Machine Intelligence.","DOI":"10.1109\/TPAMI.2024.3355890"},{"key":"2561_CR70","doi-asserted-by":"publisher","first-page":"1442","DOI":"10.1162\/tacl_a_00436","volume":"9","author":"J Li","year":"2021","unstructured":"Li, J., Cotterell, R., & Sachan, M. (2021). Differentiable subset pruning of transformer heads. Transactions of the Association for Computational Linguistics, 9, 1442\u20131459.","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"2561_CR71","unstructured":"Goyal, S., Choudhury, A.R., Raje, S., Chakaravarthy, V., Sabharwal, Y., & Verma, A. (2020). Power-bert: Accelerating bert inference via progressive word-vector elimination. In International Conference on Machine Learning, pp. (3690\u20133699). PMLR."},{"key":"2561_CR72","doi-asserted-by":"crossref","unstructured":"Yuan, L., Chen, Y., Wang, T., Yu, W., Shi, Y., Jiang, Z.-H., Tay, F.E., Feng, J., & Yan, S. (2021). Tokens-to-token vit: Training vision transformers from scratch on imagenet. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. (558\u2013567).","DOI":"10.1109\/ICCV48922.2021.00060"},{"key":"2561_CR73","doi-asserted-by":"crossref","unstructured":"Heo, B., Park, S., Han, D., & Yun, S. (2024). Rotary position embedding for vision transformer. In European Conference on Computer Vision, pp. (289\u2013305). Springer","DOI":"10.1007\/978-3-031-72684-2_17"},{"key":"2561_CR74","unstructured":"Bai, Y., Wang, H., TAO, Z., Li, K., & Fu, Y. (2022). Dual lottery ticket hypothesis. In International Conference on Learning Representations. https:\/\/linproxy.fan.workers.dev:443\/https\/openreview.net\/forum?id=fOsN52jn25l."},{"key":"2561_CR75","doi-asserted-by":"crossref","unstructured":"Fang, G., Ma, X., Song, M., Mi, M.B., & Wang, X. (2023). Depgraph: Towards any structural pruning. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. (16091\u201316101)","DOI":"10.1109\/CVPR52729.2023.01544"},{"key":"2561_CR76","unstructured":"Balaji, A., Wu, Y., & Yoon, J. Cifar100 convolutional model based classification benchmark"},{"key":"2561_CR77","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. (770\u2013778).","DOI":"10.1109\/CVPR.2016.90"},{"key":"2561_CR78","unstructured":"Ross, T.-Y., & Doll\u00e1r, G. (2017). Focal loss for dense object detection. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. (2980\u20132988)."},{"key":"2561_CR79","doi-asserted-by":"crossref","unstructured":"Xiao, T., Liu, Y., Zhou, B., Jiang, Y., & Sun, J. (2018). Unified perceptual parsing for scene understanding. In Proceedings of the European Conference on Computer Vision (ECCV), pp. (418\u2013434).","DOI":"10.1007\/978-3-030-01228-1_26"},{"key":"2561_CR80","doi-asserted-by":"crossref","unstructured":"Wang, W., Xie, E., Li, X., Fan, D.-P., Song, K., Liang, D., Lu, T., Luo, P., & Shao, L. (2021). Pyramid vision transformer: A versatile backbone for dense prediction without convolutions. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. (568\u2013578).","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"2561_CR81","doi-asserted-by":"crossref","unstructured":"Hou, Z., & Kung, S.-Y. (2022). Multi-dimensional vision transformer compression via dependency guided gaussian process search. 
In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. (3669\u20133678)","DOI":"10.1109\/CVPRW56347.2022.00411"},{"key":"2561_CR82","doi-asserted-by":"crossref","unstructured":"Liang, J., Cao, J., Sun, G., Zhang, K., Van\u00a0Gool, L., & Timofte, R. (2021). Swinir: Image restoration using swin transformer. In Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. (1833\u20131844)","DOI":"10.1109\/ICCVW54120.2021.00210"},{"key":"2561_CR83","doi-asserted-by":"crossref","unstructured":"Agustsson, E., & Timofte, R. (2017). Ntire 2017 challenge on single image super-resolution: Dataset and study. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops, pp. 126\u2013135.","DOI":"10.1109\/CVPRW.2017.150"},{"key":"2561_CR84","doi-asserted-by":"crossref","unstructured":"Bevilacqua, M., Roumy, A., Guillemot, C., & Alberi-Morel, M.L. (2012). Low-complexity single-image super-resolution based on nonnegative neighbor embedding.","DOI":"10.5244\/C.26.135"},{"key":"2561_CR85","doi-asserted-by":"crossref","unstructured":"Martin, D., Fowlkes, C., Tal, D., & Malik, J. (2001). A database of human segmented natural images and its application to evaluating segmentation algorithms and measuring ecological statistics. In Proceedings Eighth IEEE International Conference on Computer Vision. ICCV 2001, vol. 2, pp. (416\u2013423). IEEE","DOI":"10.1109\/ICCV.2001.937655"},{"key":"2561_CR86","doi-asserted-by":"crossref","unstructured":"Huang, J.-B., Singh, A., & Ahuja, N. (2015). Single image super-resolution from transformed self-exemplars. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. (5197\u20135206)","DOI":"10.1109\/CVPR.2015.7299156"},{"key":"2561_CR87","doi-asserted-by":"publisher","first-page":"21811","DOI":"10.1007\/s11042-016-4020-z","volume":"76","author":"Y Matsui","year":"2017","unstructured":"Matsui, Y., Ito, K., Aramaki, Y., Fujimoto, A., Ogawa, T., Yamasaki, T., & Aizawa, K. (2017). Sketch-based manga retrieval using manga109 dataset. Multimedia tools and applications, 76, 21811\u201321838.","journal-title":"Multimedia tools and applications"},{"key":"2561_CR88","unstructured":"Hu, E.J., shen, Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., Wang, L., & Chen, W. (2022). LoRA: Low-rank adaptation of large language models. In International Conference on Learning Representations. 
https:\/\/linproxy.fan.workers.dev:443\/https\/openreview.net\/forum?id=nZeVKeeFYf9"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/linproxy.fan.workers.dev:443\/https\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02561-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/linproxy.fan.workers.dev:443\/https\/link.springer.com\/article\/10.1007\/s11263-025-02561-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/linproxy.fan.workers.dev:443\/https\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02561-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T06:30:37Z","timestamp":1762929037000},"score":1,"resource":{"primary":{"URL":"https:\/\/linproxy.fan.workers.dev:443\/https\/link.springer.com\/10.1007\/s11263-025-02561-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,28]]},"references-count":88,"journal-issue":{"issue":"11","published-print":{"date-parts":[[2025,11]]}},"alternative-id":["2561"],"URL":"https:\/\/linproxy.fan.workers.dev:443\/https\/doi.org\/10.1007\/s11263-025-02561-w","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"type":"print","value":"0920-5691"},{"type":"electronic","value":"1573-1405"}],"subject":[],"published":{"date-parts":[[2025,8,28]]},"assertion":[{"value":"16 November 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 August 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 August 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}