{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,20]],"date-time":"2026-02-20T16:17:58Z","timestamp":1771604278428,"version":"3.50.1"},"reference-count":74,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,12,24]],"date-time":"2025-12-24T00:00:00Z","timestamp":1766534400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/linproxy.fan.workers.dev:443\/https\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,12,24]],"date-time":"2025-12-24T00:00:00Z","timestamp":1766534400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/linproxy.fan.workers.dev:443\/https\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62176062"],"award-info":[{"award-number":["62176062"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2026,1]]},"DOI":"10.1007\/s11263-025-02586-1","type":"journal-article","created":{"date-parts":[[2025,12,24]],"date-time":"2025-12-24T08:52:53Z","timestamp":1766566373000},"update-policy":"https:\/\/linproxy.fan.workers.dev:443\/https\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["MMRL++: Parameter-Efficient and Interaction-Aware Representation Learning for Vision-Language Models"],"prefix":"10.1007","volume":"134","author":[{"given":"Yuncheng","family":"Guo","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/linproxy.fan.workers.dev:443\/https\/orcid.org\/0000-0002-7096-1830","authenticated-orcid":false,"given":"Xiaodong","family":"Gu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,12,24]]},"reference":[{"key":"2586_CR1","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., & Clark, J. (2021) Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763 (2021). PMLR"},{"key":"2586_CR2","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y.-T., Parekh, Z., Pham, H., Le, Q., Sung, Y.-H., Li, Z., Duerig, T. (2021) Scaling up visual and vision-language representation learning with noisy text supervision. In: International Conference on Machine Learning, pp. 4904\u20134916 . PMLR"},{"key":"2586_CR3","first-page":"23716","volume":"35","author":"J-B Alayrac","year":"2022","unstructured":"Alayrac, J.-B., Donahue, J., Luc, P., Miech, A., Barr, I., Hasson, Y., Lenc, K., Mensch, A., Millican, K., & Reynolds, M. (2022). Flamingo: a visual language model for few-shot learning. Advances in neural information processing systems, 35, 23716\u201323736.","journal-title":"Advances in neural information processing systems"},{"key":"2586_CR4","unstructured":"Yao, L., Huang, R., Hou, L., Lu, G., Niu, M., Xu, H., Liang, X., Li, Z., Jiang, X., Xu, C. (2021) Filip: Fine-grained interactive language-image pre-training. arXiv preprint arXiv:2111.07783"},{"key":"2586_CR5","first-page":"72096","volume":"36","author":"S Huang","year":"2023","unstructured":"Huang, S., Dong, L., Wang, W., Hao, Y., Singhal, S., Ma, S., Lv, T., Cui, L., Mohammed, O. K., & Patra, B. (2023). Language is not all you need: Aligning perception with language models. Advances in Neural Information Processing Systems, 36, 72096\u201372109.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2586_CR6","unstructured":"Peng, Z., Wang, W., Dong, L., Hao, Y., Huang, S., Ma, S., Wei, F. (2023) Kosmos-2: Grounding Multimodal Large Language Models to the World . https:\/\/linproxy.fan.workers.dev:443\/https\/arxiv.org\/abs\/2306.14824"},{"key":"2586_CR7","doi-asserted-by":"crossref","unstructured":"Lin, J., Yin, H., Ping, W., Molchanov, P., Shoeybi, M., Han, S. (2024) Vila: On pre-training for visual language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 26689\u201326699","DOI":"10.1109\/CVPR52733.2024.02520"},{"key":"2586_CR8","unstructured":"Oord, A., Li, Y., Vinyals, O. (2019) Representation Learning with Contrastive Predictive Coding. https:\/\/linproxy.fan.workers.dev:443\/https\/arxiv.org\/abs\/1807.03748"},{"key":"2586_CR9","doi-asserted-by":"crossref","unstructured":"Huang, C., Jiang, A., Feng, J., Zhang, Y., Wang, X., Wang, Y. (2024) Adapting visual-language models for generalizable anomaly detection in medical images. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11375\u201311385","DOI":"10.1109\/CVPR52733.2024.01081"},{"key":"2586_CR10","unstructured":"Zhao, Z., Liu, Y., Wu, H., Wang, M., Li, Y., Wang, S., Teng, L., Liu, D., Cui, Z., Wang, Q., Shen, D. (2024) CLIP in Medical Imaging: A Comprehensive Survey . https:\/\/linproxy.fan.workers.dev:443\/https\/arxiv.org\/abs\/2312.07353"},{"key":"2586_CR11","doi-asserted-by":"publisher","unstructured":"Wang, Z., Wu, Z., Agarwal, D., Sun, J. (2022) MedCLIP: Contrastive learning from unpaired medical images and text. In: Goldberg, Y., Kozareva, Z., Zhang, Y. (eds.) Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pp. 3876\u20133887. Association for Computational Linguistics, Abu Dhabi, United Arab Emirates . https:\/\/linproxy.fan.workers.dev:443\/https\/doi.org\/10.18653\/v1\/2022.emnlp-main.256 .","DOI":"10.18653\/v1\/2022.emnlp-main.256"},{"key":"2586_CR12","doi-asserted-by":"crossref","unstructured":"Barraco, M., Cornia, M., Cascianelli, S., Baraldi, L., Cucchiara, R. (2022) The unreasonable effectiveness of clip features for image captioning: an experimental analysis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4662\u20134670","DOI":"10.1109\/CVPRW56347.2022.00512"},{"key":"2586_CR13","unstructured":"Mokady, R., Hertz, A., Bermano, A.H. (2021) ClipCap: CLIP Prefix for Image Captioning . https:\/\/linproxy.fan.workers.dev:443\/https\/arxiv.org\/abs\/2111.09734"},{"key":"2586_CR14","doi-asserted-by":"crossref","unstructured":"Tang, M., Wang, Z., Liu, Z., Rao, F., Li, D., Li, X. (2021) Clip4caption: Clip for video caption. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 4858\u20134862","DOI":"10.1145\/3474085.3479207"},{"key":"2586_CR15","doi-asserted-by":"crossref","unstructured":"\u00d6zdemir, \u00d6., Akag\u00fcnd\u00fcz, E. (2024) Enhancing visual question answering through question-driven image captions as prompts. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1562\u20131571","DOI":"10.1109\/CVPRW63382.2024.00163"},{"key":"2586_CR16","doi-asserted-by":"crossref","unstructured":"Ye, S., Kong, W., Yao, C., Ren, J., Jiang, X. (2023) Video question answering using clip-guided visual-text attention. In: 2023 IEEE International Conference on Image Processing (ICIP), pp. 81\u201385. IEEE","DOI":"10.1109\/ICIP49359.2023.10222286"},{"key":"2586_CR17","unstructured":"Wang, G., Ge, Y., Ding, X., Kankanhalli, M., Shan, Y. (2023) What Makes for Good Visual Tokenizers for Large Language Models? . https:\/\/linproxy.fan.workers.dev:443\/https\/arxiv.org\/abs\/2305.12223"},{"key":"2586_CR18","doi-asserted-by":"crossref","unstructured":"Parkhi, O.M., Vedaldi, A., Zisserman, A., Jawahar, C. (2012) Cats and dogs. In: 2012 IEEE Conference on Computer Vision and Pattern Recognition, pp. 3498\u20133505. IEEE","DOI":"10.1109\/CVPR.2012.6248092"},{"issue":"9","key":"2586_CR19","doi-asserted-by":"publisher","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","volume":"130","author":"K Zhou","year":"2022","unstructured":"Zhou, K., Yang, J., Loy, C. C., & Liu, Z. (2022). Learning to prompt for vision-language models. International Journal of Computer Vision, 130(9), 2337\u20132348.","journal-title":"International Journal of Computer Vision"},{"key":"2586_CR20","doi-asserted-by":"publisher","unstructured":"Lester, B., Al-Rfou, R., Constant, N.: The power of scale for parameter-efficient prompt tuning. In: Moens, M.-F., Huang, X., Specia, L., Yih, S.W.-t. (2021) (eds.) Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pp. 3045\u20133059. Association for Computational Linguistics, Online and Punta Cana, Dominican Republic . https:\/\/linproxy.fan.workers.dev:443\/https\/doi.org\/10.18653\/v1\/2021.emnlp-main.243 .","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"2586_CR21","doi-asserted-by":"crossref","unstructured":"Khattak, M.U., Rasheed, H., Maaz, M., Khan, S., Khan, F.S. (2023) Maple: Multi-modal prompt learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19113\u201319122","DOI":"10.1109\/CVPR52729.2023.01832"},{"issue":"2","key":"2586_CR22","doi-asserted-by":"publisher","first-page":"581","DOI":"10.1007\/s11263-023-01891-x","volume":"132","author":"P Gao","year":"2024","unstructured":"Gao, P., Geng, S., Zhang, R., Ma, T., Fang, R., Zhang, Y., Li, H., & Qiao, Y. (2024). Clip-adapter: Better vision-language models with feature adapters. International Journal of Computer Vision, 132(2), 581\u2013595.","journal-title":"International Journal of Computer Vision"},{"key":"2586_CR23","doi-asserted-by":"crossref","unstructured":"Yang, L., Zhang, R.-Y., Wang, Y., Xie, X. (2024) Mma: Multi-modal adapter for vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23826\u201323837","DOI":"10.1109\/CVPR52733.2024.02249"},{"key":"2586_CR24","doi-asserted-by":"crossref","unstructured":"Guo, Y., Gu, X. (2025) Mmrl: Multi-modal representation learning for vision-language models. In: Proceedings of the Computer Vision and Pattern Recognition Conference, pp. 25015\u201325025","DOI":"10.1109\/CVPR52734.2025.02329"},{"issue":"2","key":"2586_CR25","first-page":"3","volume":"1","author":"EJ Hu","year":"2022","unstructured":"Hu, E. J., Shen, Y., Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., Wang, L., & Chen, W. (2022). Lora: Low-rank adaptation of large language models. ICLR, 1(2), 3.","journal-title":"ICLR"},{"key":"2586_CR26","unstructured":"Wu, X., Huang, S., Wei, F. (2024) Mixture of lora experts. arXiv preprint arXiv:2404.13628"},{"key":"2586_CR27","doi-asserted-by":"crossref","unstructured":"Huang, Y., Ye, P., Huang, C., Cao, J., Zhang, L., Li, B., Yu, G., Chen, T. (2025) Ders: Towards extremely efficient upcycled mixture-of-experts models. arXiv preprint arXiv:2503.01359","DOI":"10.1109\/CVPR52734.2025.00940"},{"key":"2586_CR28","doi-asserted-by":"publisher","unstructured":"Li, X.L., Liang, P. (2021) Prefix-tuning: Optimizing continuous prompts for generation. In: Zong, C., Xia, F., Li, W., Navigli, R. (eds.) Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pp. 4582\u20134597. Association for Computational Linguistics, Online . https:\/\/linproxy.fan.workers.dev:443\/https\/doi.org\/10.18653\/v1\/2021.acl-long.353 .","DOI":"10.18653\/v1\/2021.acl-long.353"},{"key":"2586_CR29","doi-asserted-by":"publisher","unstructured":"Liu, X., Ji, K., Fu, Y., Tam, W., Du, Z., Yang, Z., Tang, J. (2022) P-tuning: Prompt tuning can be comparable to fine-tuning across scales and tasks. In: Muresan, S., Nakov, P., Villavicencio, A. (eds.) Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers), pp. 61\u201368. Association for Computational Linguistics, Dublin, Ireland . https:\/\/linproxy.fan.workers.dev:443\/https\/doi.org\/10.18653\/v1\/2022.acl-short.8 .","DOI":"10.18653\/v1\/2022.acl-short.8"},{"key":"2586_CR30","doi-asserted-by":"crossref","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z. (2022) Conditional prompt learning for vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16816\u201316825","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"2586_CR31","doi-asserted-by":"crossref","unstructured":"Lu, Y., Liu, J., Zhang, Y., Liu, Y., Tian, X. (2022) Prompt distribution learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5206\u20135215","DOI":"10.1109\/CVPR52688.2022.00514"},{"key":"2586_CR32","unstructured":"Chen, G., Yao, W., Song, X., Li, X., Rao, Y., Zhang, K.: (2023) PLOT: Prompt Learning with Optimal Transport for Vision-Language Models . https:\/\/linproxy.fan.workers.dev:443\/https\/arxiv.org\/abs\/2210.01253"},{"key":"2586_CR33","doi-asserted-by":"crossref","unstructured":"Yao, H., Zhang, R., Xu, C. (2023) Visual-language prompt tuning with knowledge-guided context optimization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6757\u20136767","DOI":"10.1109\/CVPR52729.2023.00653"},{"key":"2586_CR34","doi-asserted-by":"crossref","unstructured":"Zhu, B., Niu, Y., Han, Y., Wu, Y., Zhang, H. (2023) Prompt-aligned gradient for prompt tuning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15659\u201315669","DOI":"10.1109\/ICCV51070.2023.01435"},{"key":"2586_CR35","doi-asserted-by":"crossref","unstructured":"Lee, D., Song, S., Suh, J., Choi, J., Lee, S., Kim, H.J. (2023) Read-only prompt optimization for vision-language few-shot learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1401\u20131411","DOI":"10.1109\/ICCV51070.2023.00135"},{"key":"2586_CR36","doi-asserted-by":"crossref","unstructured":"Xu, C., Zhu, Y., Shen, H., Chen, B., Liao, Y., Chen, X., Wang, L. (2024) Progressive visual prompt learning with contrastive feature re-formation. International Journal of Computer Vision, 1\u201316","DOI":"10.1007\/s11263-024-02172-x"},{"key":"2586_CR37","doi-asserted-by":"crossref","unstructured":"Khattak, M.U., Wasim, S.T., Naseer, M., Khan, S., Yang, M.-H., Khan, F.S. (2023) Self-regulating prompts: Foundational model adaptation without forgetting. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15190\u201315200","DOI":"10.1109\/ICCV51070.2023.01394"},{"key":"2586_CR38","doi-asserted-by":"crossref","unstructured":"Zhao, C., Wang, Y., Jiang, X., Shen, Y., Song, K., Li, D., Miao, D. (2024) Learning domain invariant prompt for vision-language models. IEEE Transactions on Image Processing","DOI":"10.1109\/TIP.2024.3362062"},{"key":"2586_CR39","doi-asserted-by":"crossref","unstructured":"Yao, H., Zhang, R., Xu, C. (2024) Tcp: Textual-based class-aware prompt tuning for visual-language model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23438\u201323448","DOI":"10.1109\/CVPR52733.2024.02212"},{"key":"2586_CR40","doi-asserted-by":"crossref","unstructured":"Zhang, R., Zhang, W., Fang, R., Gao, P., Li, K., Dai, J., Qiao, Y., Li, H. (2022) Tip-adapter: Training-free adaption of clip for few-shot classification. In: European Conference on Computer Vision, pp. 493\u2013510. Springer","DOI":"10.1007\/978-3-031-19833-5_29"},{"key":"2586_CR41","doi-asserted-by":"crossref","unstructured":"Zanella, M., Ben\u00a0Ayed, I. (2024) Low-rank few-shot adaptation of vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1593\u20131603","DOI":"10.1109\/CVPRW63382.2024.00166"},{"key":"2586_CR42","doi-asserted-by":"crossref","unstructured":"McCloskey, M., Cohen, N.J. (1989) Catastrophic interference in connectionist networks: The sequential learning problem. In: Psychology of Learning and Motivation vol. 24, pp. 109\u2013165. Elsevier, ???","DOI":"10.1016\/S0079-7421(08)60536-8"},{"issue":"13","key":"2586_CR43","doi-asserted-by":"publisher","first-page":"3521","DOI":"10.1073\/pnas.1611835114","volume":"114","author":"J Kirkpatrick","year":"2017","unstructured":"Kirkpatrick, J., Pascanu, R., Rabinowitz, N., Veness, J., Desjardins, G., Rusu, A. A., Milan, K., Quan, J., Ramalho, T., & Grabska-Barwinska, A. (2017). Overcoming catastrophic forgetting in neural networks. Proceedings of the national academy of sciences, 114(13), 3521\u20133526.","journal-title":"Proceedings of the national academy of sciences"},{"key":"2586_CR44","doi-asserted-by":"crossref","unstructured":"Rebuffi, S.-A., Kolesnikov, A., Sperl, G., Lampert, C.H. (2017) icarl: Incremental classifier and representation learning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2001\u20132010","DOI":"10.1109\/CVPR.2017.587"},{"key":"2586_CR45","unstructured":"Rusu, A.A., Rabinowitz, N.C., Desjardins, G., Soyer, H., Kirkpatrick, J., Kavukcuoglu, K., Pascanu, R., Hadsell, R. (2016) Progressive neural networks. arXiv preprint arXiv:1606.04671"},{"key":"2586_CR46","doi-asserted-by":"crossref","unstructured":"Liang, Y.-S., Li, W.-J. (2024) Inflora: Interference-free low-rank adaptation for continual learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23638\u201323647","DOI":"10.1109\/CVPR52733.2024.02231"},{"key":"2586_CR47","unstructured":"Vaswani, A. (2017) Attention is all you need. Advances in Neural Information Processing Systems"},{"key":"2586_CR48","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.-J., Li, K., Fei-Fei, L. (2009) Imagenet: A large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255 . Ieee","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"2586_CR49","unstructured":"Fei-Fei, L., Fergus, R., Perona, P. (2004) Learning generative visual models from few training examples: An incremental bayesian approach tested on 101 object categories. In: 2004 Conference on Computer Vision and Pattern Recognition Workshop, pp. 178\u2013178 . IEEE"},{"key":"2586_CR50","doi-asserted-by":"crossref","unstructured":"Krause, J., Stark, M., Deng, J., Fei-Fei, L. (2013) 3d object representations for fine-grained categorization. In: Proceedings of the IEEE International Conference on Computer Vision Workshops, pp. 554\u2013561","DOI":"10.1109\/ICCVW.2013.77"},{"key":"2586_CR51","doi-asserted-by":"crossref","unstructured":"Nilsback, M.-E., Zisserman, A. (2008) Automated flower classification over a large number of classes. In: 2008 Sixth Indian Conference on Computer Vision, Graphics & Image Processing, pp. 722\u2013729 . IEEE","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"2586_CR52","doi-asserted-by":"crossref","unstructured":"Bossard, L., Guillaumin, M., Van\u00a0Gool, L. (2014) Food-101\u2013mining discriminative components with random forests. In: Computer vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part VI 13, pp. 446\u2013461 . Springer","DOI":"10.1007\/978-3-319-10599-4_29"},{"key":"2586_CR53","unstructured":"Maji, S., Rahtu, E., Kannala, J., Blaschko, M., Vedaldi, A. (2013) Fine-grained visual classification of aircraft. arXiv preprint arXiv:1306.5151"},{"key":"2586_CR54","doi-asserted-by":"crossref","unstructured":"Xiao, J., Hays, J., Ehinger, K.A., Oliva, A., Torralba, A. (2010) Sun database: Large-scale scene recognition from abbey to zoo. In: 2010 IEEE Computer Society Conference on Computer Vision and Pattern Recognition, pp. 3485\u20133492 . IEEE","DOI":"10.1109\/CVPR.2010.5539970"},{"key":"2586_CR55","doi-asserted-by":"crossref","unstructured":"Nilsback, M.-E., Zisserman, A. (2008) Automated flower classification over a large number of classes. In: 2008 Sixth Indian Conference on Computer Vision, Graphics & Image Processing, pp. 722\u2013729 . IEEE","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"2586_CR56","doi-asserted-by":"crossref","unstructured":"Cimpoi, M., Maji, S., Kokkinos, I., Mohamed, S., Vedaldi, A. (2014) Describing textures in the wild. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3606\u20133613","DOI":"10.1109\/CVPR.2014.461"},{"issue":"7","key":"2586_CR57","doi-asserted-by":"publisher","first-page":"2217","DOI":"10.1109\/JSTARS.2019.2918242","volume":"12","author":"P Helber","year":"2019","unstructured":"Helber, P., Bischke, B., Dengel, A., & Borth, D. (2019). Eurosat: A novel dataset and deep learning benchmark for land use and land cover classification. IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing, 12(7), 2217\u20132226.","journal-title":"IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing"},{"key":"2586_CR58","unstructured":"Recht, B., Roelofs, R., Schmidt, L., Shankar, V. (2019) Do imagenet classifiers generalize to imagenet? In: International Conference on Machine Learning, pp. 5389\u20135400 . PMLR"},{"key":"2586_CR59","unstructured":"Wang, H., Ge, S., Lipton, Z., Xing, E.P. (2019) Learning robust global representations by penalizing local predictive power. Advances in Neural Information Processing Systems 32"},{"key":"2586_CR60","doi-asserted-by":"crossref","unstructured":"Hendrycks, D., Zhao, K., Basart, S., Steinhardt, J., Song, D. (2021) Natural adversarial examples. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15262\u201315271","DOI":"10.1109\/CVPR46437.2021.01501"},{"key":"2586_CR61","doi-asserted-by":"crossref","unstructured":"Hendrycks, D., Basart, S., Mu, N., Kadavath, S., Wang, F., Dorundo, E., Desai, R., Zhu, T., Parajuli, S., Guo, M. (2021) The many faces of robustness: A critical analysis of out-of-distribution generalization. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8340\u20138349","DOI":"10.1109\/ICCV48922.2021.00823"},{"key":"2586_CR62","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., Zitnick, C.L. (2014) Microsoft coco: Common objects in context. In: European Conference on Computer Vision, pp. 740\u2013755 . Springer","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"2586_CR63","doi-asserted-by":"crossref","unstructured":"Antol, S., Agrawal, A., Lu, J., Mitchell, M., Batra, D., Zitnick, C.L., Parikh, D. (2015) Vqa: Visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2425\u20132433","DOI":"10.1109\/ICCV.2015.279"},{"key":"2586_CR64","doi-asserted-by":"crossref","unstructured":"Goyal, Y., Khot, T., Summers-Stay, D., Batra, D., Parikh, D. (2017) Making the v in vqa matter: Elevating the role of image understanding in visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6904\u20136913","DOI":"10.1109\/CVPR.2017.670"},{"key":"2586_CR65","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., Uszkoreit, J., Houlsby, N. (2021) An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale . https:\/\/linproxy.fan.workers.dev:443\/https\/arxiv.org\/abs\/2010.11929"},{"key":"2586_CR66","unstructured":"Roy, S., Etemad, A. (2024) Consistency-guided prompt learning for vision-language models."},{"key":"2586_CR67","doi-asserted-by":"crossref","unstructured":"Wang, Y., Jiang, X., Cheng, D., Li, D., Zhao, C. (2024) Learning hierarchical prompt with structured linguistic knowledge for vision-language models. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 38, pp. 5749\u20135757","DOI":"10.1609\/aaai.v38i6.28387"},{"key":"2586_CR68","doi-asserted-by":"crossref","unstructured":"Tian, X., Zou, S., Yang, Z., Zhang, J. (2024) Argue: Attribute-guided prompt tuning for vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 28578\u201328587","DOI":"10.1109\/CVPR52733.2024.02700"},{"key":"2586_CR69","doi-asserted-by":"crossref","unstructured":"Yang, L., Zhang, R.-Y., Chen, Q., Xie, X. (2025) Learning with enriched inductive biases for vision-language models. International Journal of Computer Vision, 1\u201316","DOI":"10.1007\/s11263-025-02354-1"},{"key":"2586_CR70","doi-asserted-by":"crossref","unstructured":"Li, Z., Li, X., Fu, X., Zhang, X., Wang, W., Chen, S., Yang, J. (2024) Promptkd: Unsupervised prompt distillation for vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 26617\u201326626","DOI":"10.1109\/CVPR52733.2024.02513"},{"key":"2586_CR71","doi-asserted-by":"crossref","unstructured":"Lee, K.-H., Chen, X., Hua, G., Hu, H., He, X. (2018) Stacked cross attention for image-text matching. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 201\u2013216","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"2586_CR72","doi-asserted-by":"crossref","unstructured":"Chen, H., Ding, G., Liu, X., Lin, Z., Liu, J., Han, J. (2020) Imram: Iterative matching with recurrent attention memory for cross-modal image-text retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12655\u201312663","DOI":"10.1109\/CVPR42600.2020.01267"},{"key":"2586_CR73","doi-asserted-by":"crossref","unstructured":"Zhang, K., Mao, Z., Wang, Q., Zhang, Y. (2022) Negative-aware attention framework for image-text matching. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15661\u201315670","DOI":"10.1109\/CVPR52688.2022.01521"},{"key":"2586_CR74","doi-asserted-by":"crossref","unstructured":"Pan, Z., Wu, F., Zhang, B. (2023) Fine-grained image-text matching by cross-modal hard aligning network. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19275\u201319284","DOI":"10.1109\/CVPR52729.2023.01847"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/linproxy.fan.workers.dev:443\/https\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02586-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/linproxy.fan.workers.dev:443\/https\/link.springer.com\/article\/10.1007\/s11263-025-02586-1","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/linproxy.fan.workers.dev:443\/https\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02586-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,20]],"date-time":"2026-02-20T15:43:14Z","timestamp":1771602194000},"score":1,"resource":{"primary":{"URL":"https:\/\/linproxy.fan.workers.dev:443\/https\/link.springer.com\/10.1007\/s11263-025-02586-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,24]]},"references-count":74,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2026,1]]}},"alternative-id":["2586"],"URL":"https:\/\/linproxy.fan.workers.dev:443\/https\/doi.org\/10.1007\/s11263-025-02586-1","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,12,24]]},"assertion":[{"value":"14 May 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 December 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 December 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"11"}}