{"id":"https://openalex.org/W4402716330","doi":"https://doi.org/10.1109/cvpr52733.2024.02527","title":"Monkey: Image Resolution and Text Label are Important Things for Large Multi-Modal Models","display_name":"Monkey: Image Resolution and Text Label are Important Things for Large Multi-Modal Models","publication_year":2024,"publication_date":"2024-06-16","ids":{"openalex":"https://openalex.org/W4402716330","doi":"https://doi.org/10.1109/cvpr52733.2024.02527"},"language":"en","primary_location":{"id":"doi:10.1109/cvpr52733.2024.02527","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cvpr52733.2024.02527","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100418968","display_name":"Zhang Li","orcid":"https://orcid.org/0000-0003-1659-0466"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":true,"raw_author_name":"Zhang Li","raw_affiliation_strings":["Huazhong University of Science and Technology"],"affiliations":[{"raw_affiliation_string":"Huazhong University of Science and Technology","institution_ids":["https://openalex.org/I47720641"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101559943","display_name":"Biao Yang","orcid":"https://orcid.org/0000-0002-2337-3649"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Biao Yang","raw_affiliation_strings":["Huazhong University of Science and Technology"],"affiliations":[{"raw_affiliation_string":"Huazhong University of Science and Technology","institution_ids":["https://openalex.org/I47720641"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100409431","display_name":"Qiang Liu","orcid":"https://orcid.org/0000-0001-7531-4459"},"institutions":[{"id":"https://openalex.org/I4210108461","display_name":"Kingsoft (China)","ror":"https://ror.org/01stnfn33","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210108461"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Qiang Liu","raw_affiliation_strings":["Kingsoft Office"],"affiliations":[{"raw_affiliation_string":"Kingsoft Office","institution_ids":["https://openalex.org/I4210108461"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101315067","display_name":"Zhiyin Ma","orcid":"https://orcid.org/0009-0007-9217-8111"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Zhiyin Ma","raw_affiliation_strings":["Huazhong University of Science and Technology"],"affiliations":[{"raw_affiliation_string":"Huazhong University of Science and Technology","institution_ids":["https://openalex.org/I47720641"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5115595960","display_name":"Shuo Zhang","orcid":"https://orcid.org/0009-0002-1303-793X"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shuo Zhang","raw_affiliation_strings":["Huazhong University of Science and Technology"],"affiliations":[{"raw_affiliation_string":"Huazhong University of Science and Technology","institution_ids":["https://openalex.org/I47720641"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5008010573","display_name":"Jingxu Yang","orcid":null},"institutions":[{"id":"https://openalex.org/I4210108461","display_name":"Kingsoft (China)","ror":"https://ror.org/01stnfn33","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210108461"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jingxu Yang","raw_affiliation_strings":["Kingsoft Office"],"affiliations":[{"raw_affiliation_string":"Kingsoft Office","institution_ids":["https://openalex.org/I4210108461"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028368874","display_name":"Yabo Sun","orcid":"https://orcid.org/0009-0005-0556-8513"},"institutions":[{"id":"https://openalex.org/I4210108461","display_name":"Kingsoft (China)","ror":"https://ror.org/01stnfn33","country_code":"CN","type":"company","lineage":["https://openalex.org/I4210108461"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yabo Sun","raw_affiliation_strings":["Kingsoft Office"],"affiliations":[{"raw_affiliation_string":"Kingsoft Office","institution_ids":["https://openalex.org/I4210108461"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100389396","display_name":"Yuliang Liu","orcid":"https://orcid.org/0000-0002-3037-173X"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Yuliang Liu","raw_affiliation_strings":["Huazhong University of Science and Technology"],"affiliations":[{"raw_affiliation_string":"Huazhong University of Science and Technology","institution_ids":["https://openalex.org/I47720641"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5101763725","display_name":"Xiang Bai","orcid":"https://orcid.org/0000-0002-8961-3916"},"institutions":[{"id":"https://openalex.org/I47720641","display_name":"Huazhong University of Science and Technology","ror":"https://ror.org/00p991c53","country_code":"CN","type":"education","lineage":["https://openalex.org/I47720641"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xiang Bai","raw_affiliation_strings":["Huazhong University of Science and Technology"],"affiliations":[{"raw_affiliation_string":"Huazhong University of Science and Technology","institution_ids":["https://openalex.org/I47720641"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":9,"corresponding_author_ids":["https://openalex.org/A5100418968"],"corresponding_institution_ids":["https://openalex.org/I47720641"],"apc_list":null,"apc_paid":null,"fwci":19.023,"has_fulltext":false,"cited_by_count":75,"citation_normalized_percentile":{"value":0.99660338,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":null,"issue":null,"first_page":"26753","last_page":"26763"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10812","display_name":"Human Pose and Action Recognition","score":0.9937000274658203,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9900000095367432,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6882392764091492},{"id":"https://openalex.org/keywords/modal","display_name":"Modal","score":0.597663164138794},{"id":"https://openalex.org/keywords/image","display_name":"Image (mathematics)","score":0.5616703629493713},{"id":"https://openalex.org/keywords/resolution","display_name":"Resolution (logic)","score":0.5187345147132874},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.49905896186828613},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.4756490886211395},{"id":"https://openalex.org/keywords/image-resolution","display_name":"Image resolution","score":0.4215225279331207}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6882392764091492},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.597663164138794},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.5616703629493713},{"id":"https://openalex.org/C138268822","wikidata":"https://www.wikidata.org/wiki/Q1051925","display_name":"Resolution (logic)","level":2,"score":0.5187345147132874},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.49905896186828613},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.4756490886211395},{"id":"https://openalex.org/C205372480","wikidata":"https://www.wikidata.org/wiki/Q210521","display_name":"Image resolution","level":2,"score":0.4215225279331207},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.1109/cvpr52733.2024.02527","is_oa":false,"landing_page_url":"https://doi.org/10.1109/cvpr52733.2024.02527","pdf_url":null,"source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"2024 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","raw_type":"proceedings-article"}],"best_oa_location":null,"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G5467188374","display_name":null,"funder_award_id":"62225603,62206104","funder_id":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China"}],"funders":[{"id":"https://openalex.org/F4320321001","display_name":"National Natural Science Foundation of China","ror":"https://ror.org/01h0zpd94"}],"has_content":{"pdf":false,"grobid_xml":false},"content_urls":null,"referenced_works_count":67,"referenced_works":["https://openalex.org/W1905882502","https://openalex.org/W2185175083","https://openalex.org/W2307512708","https://openalex.org/W2560730294","https://openalex.org/W2886641317","https://openalex.org/W2947312908","https://openalex.org/W2963518342","https://openalex.org/W2963622213","https://openalex.org/W2963899988","https://openalex.org/W2971822538","https://openalex.org/W2979382951","https://openalex.org/W2988326850","https://openalex.org/W3004268082","https://openalex.org/W3087858202","https://openalex.org/W3106859150","https://openalex.org/W3120043490","https://openalex.org/W3168867926","https://openalex.org/W3173585224","https://openalex.org/W3201693581","https://openalex.org/W4213213306","https://openalex.org/W4225323055","https://openalex.org/W4283208789","https://openalex.org/W4285255856","https://openalex.org/W4296605665","https://openalex.org/W4306820534","https://openalex.org/W4310629611","https://openalex.org/W4366330503","https://openalex.org/W4366850747","https://openalex.org/W4367367040","https://openalex.org/W4376653374","https://openalex.org/W4378942562","https://openalex.org/W4380994594","https://openalex.org/W4382142077","https://openalex.org/W4382490555","https://openalex.org/W4382491206","https://openalex.org/W4385645323","https://openalex.org/W4385970122","https://openalex.org/W4387436689","https://openalex.org/W4387688013","https://openalex.org/W4388513209","https://openalex.org/W4388555312","https://openalex.org/W4393149524","https://openalex.org/W6757817989","https://openalex.org/W6767970677","https://openalex.org/W6773842166","https://openalex.org/W6783441721","https://openalex.org/W6810334672","https://openalex.org/W6839091850","https://openalex.org/W6843405348","https://openalex.org/W6846007759","https://openalex.org/W6846178434","https://openalex.org/W6850503672","https://openalex.org/W6851578965","https://openalex.org/W6851950068","https://openalex.org/W6852060543","https://openalex.org/W6852609638","https://openalex.org/W6853116092","https://openalex.org/W6853330684","https://openalex.org/W6854222408","https://openalex.org/W6854231400","https://openalex.org/W6854262950","https://openalex.org/W6854347851","https://openalex.org/W6855297460","https://openalex.org/W6855815363","https://openalex.org/W6856808864","https://openalex.org/W6857162426","https://openalex.org/W6858147810"],"related_works":["https://openalex.org/W2362774332","https://openalex.org/W4249245269","https://openalex.org/W2025681766","https://openalex.org/W2765548132","https://openalex.org/W2159897444","https://openalex.org/W2542402767","https://openalex.org/W3023086044","https://openalex.org/W2142226356","https://openalex.org/W2056165575","https://openalex.org/W3210000161"],"abstract_inverted_index":{"Large":[0],"Multimodal":[1],"Models":[2],"(LMMs)":[3],"have":[4],"shown":[5],"promise":[6],"in":[7,47,116,145,158],"vision-language":[8],"tasks":[9,147],"but":[10],"struggle":[11],"with":[12,57,173],"high-resolution":[13],"input":[14,32],"and":[15,151],"detailed":[16,74,111],"scene":[17],"understanding.":[18],"Addressing":[19],"these":[20],"challenges,":[21],"we":[22],"introduce":[23],"Monkey":[24,30,63,141,167],"to":[25,69],"enhance":[26],"LMM":[27],"capabilities.":[28],"Firstly,":[29],"processes":[31],"images":[33],"by":[34],"dividing":[35],"them":[36],"into":[37],"uniform":[38],"patches,":[39],"each":[40,61],"matching":[41],"the":[42,48,52,73,89,104,119,128],"size":[43],"(e.g.,":[44],"448\u00d7448)":[45],"used":[46],"original":[49],"training":[50],"of":[51,76,113,121,130],"well-trained":[53],"vision":[54],"encoder.":[55],"Equipped":[56],"individual":[58],"adapter":[59],"for":[60,91,108],"patch,":[62],"can":[64],"handle":[65],"higher":[66,105],"resolutions":[67],"up":[68],"1344\u00d7896":[70],"pixels,":[71],"enabling":[72],"capture":[75,112],"complex":[77],"visual":[78],"information.":[79],"Secondly,":[80],"it":[81],"employs":[82],"a":[83,109],"multi-level":[84],"description":[85],"generation":[86],"method,":[87],"enriching":[88],"context":[90],"scene-object":[92],"associations.":[93],"This":[94],"two-part":[95],"strategy":[96],"ensures":[97],"more":[98,110],"effective":[99],"learning":[100],"from":[101],"generated":[102],"data:":[103],"resolution":[106],"allows":[107],"visuals,":[114],"which":[115],"turn":[117],"enhances":[118],"effectiveness":[120,129],"comprehensive":[122],"descriptions.":[123],"Extensive":[124],"ablative":[125],"results":[126,171],"validate":[127],"our":[131],"designs.":[132],"Additionally,":[133],"experiments":[134],"on":[135,162],"18":[136],"datasets":[137],"further":[138],"demonstrate":[139],"that":[140],"surpasses":[142],"existing":[143],"LMMs":[144],"many":[146],"like":[148],"Image":[149],"Captioning":[150],"various":[152],"Visual":[153],"Question":[154],"Answering":[155],"formats.":[156],"Specially,":[157],"qualitative":[159],"tests":[160],"focused":[161],"dense":[163],"text":[164],"question":[165],"answering,":[166],"has":[168],"exhibited":[169],"encouraging":[170],"compared":[172],"GPT4V.":[174],"Code":[175],"is":[176],"available":[177],"at":[178],"https://github.com/Yuliang-Liu/Monkey.":[179]},"counts_by_year":[{"year":2026,"cited_by_count":7},{"year":2025,"cited_by_count":46},{"year":2024,"cited_by_count":22}],"updated_date":"2026-03-20T23:20:44.827607","created_date":"2025-10-10T00:00:00"}
