{"id":"https://openalex.org/W4393213448","doi":"https://doi.org/10.48550/arxiv.2403.16558","title":"Elysium: Exploring Object-level Perception in Videos via MLLM","display_name":"Elysium: Exploring Object-level Perception in Videos via MLLM","publication_year":2024,"publication_date":"2024-03-25","ids":{"openalex":"https://openalex.org/W4393213448","doi":"https://doi.org/10.48550/arxiv.2403.16558"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2403.16558","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2403.16558","pdf_url":"https://arxiv.org/pdf/2403.16558","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2403.16558","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5022798483","display_name":"Han Wang","orcid":"https://orcid.org/0000-0002-5002-3708"},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Wang, Han","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100738098","display_name":"Yanjie Wang","orcid":"https://orcid.org/0000-0001-9332-7720"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Yanjie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5103248206","display_name":"Yongjie Ye","orcid":"https://orcid.org/0009-0000-0093-7993"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ye, Yongjie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5056540913","display_name":"Yuxiang Nie","orcid":"https://orcid.org/0009-0001-4197-1079"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nie, Yuxiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5101270190","display_name":"Can Huang","orcid":"https://orcid.org/0009-0000-2194-9643"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Can","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":5,"corresponding_author_ids":["https://openalex.org/A5022798483"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":1,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.6392999887466431,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.6392999887466431,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.571399986743927,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/perception","display_name":"Perception","score":0.71878582239151},{"id":"https://openalex.org/keywords/object","display_name":"Object (grammar)","score":0.6482454538345337},{"id":"https://openalex.org/keywords/computer-vision","display_name":"Computer vision","score":0.4169130325317383},{"id":"https://openalex.org/keywords/psychology","display_name":"Psychology","score":0.41645222902297974},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.38175806403160095},{"id":"https://openalex.org/keywords/cognitive-psychology","display_name":"Cognitive psychology","score":0.354113906621933},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3457202613353729},{"id":"https://openalex.org/keywords/geography","display_name":"Geography","score":0.3210674226284027},{"id":"https://openalex.org/keywords/neuroscience","display_name":"Neuroscience","score":0.045553624629974365}],"concepts":[{"id":"https://openalex.org/C26760741","wikidata":"https://www.wikidata.org/wiki/Q160402","display_name":"Perception","level":2,"score":0.71878582239151},{"id":"https://openalex.org/C2781238097","wikidata":"https://www.wikidata.org/wiki/Q175026","display_name":"Object (grammar)","level":2,"score":0.6482454538345337},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4169130325317383},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.41645222902297974},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.38175806403160095},{"id":"https://openalex.org/C180747234","wikidata":"https://www.wikidata.org/wiki/Q23373","display_name":"Cognitive psychology","level":1,"score":0.354113906621933},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3457202613353729},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.3210674226284027},{"id":"https://openalex.org/C169760540","wikidata":"https://www.wikidata.org/wiki/Q207011","display_name":"Neuroscience","level":1,"score":0.045553624629974365}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2403.16558","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2403.16558","pdf_url":"https://arxiv.org/pdf/2403.16558","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2403.16558","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2403.16558","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2403.16558","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2403.16558","pdf_url":"https://arxiv.org/pdf/2403.16558","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4393213448.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2058170566","https://openalex.org/W2755342338","https://openalex.org/W2772917594","https://openalex.org/W2775347418","https://openalex.org/W2166024367","https://openalex.org/W3116076068","https://openalex.org/W2229312674","https://openalex.org/W2951359407","https://openalex.org/W2079911747","https://openalex.org/W1969923398"],"abstract_inverted_index":{"Multi-modal":[0],"Large":[1,75],"Language":[2,76],"Models":[3,77],"(MLLMs)":[4],"have":[5],"demonstrated":[6],"their":[7,16],"ability":[8],"to":[9,34,47,53,143,166],"perceive":[10,54],"objects":[11,55],"in":[12,18,155,170],"still":[13],"images,":[14],"but":[15],"application":[17],"video-related":[19],"tasks,":[20],"such":[21],"as":[22],"object":[23,125],"tracking,":[24],"remains":[25],"understudied.":[26],"This":[27],"lack":[28],"of":[29,68,74,135],"exploration":[30],"is":[31,45,159],"primarily":[32],"due":[33],"two":[35],"key":[36],"challenges.":[37],"Firstly,":[38],"extensive":[39],"pretraining":[40],"on":[41],"large-scale":[42,94],"video":[43,95,121],"datasets":[44,183],"required":[46],"equip":[48],"MLLMs":[49,136],"with":[50,123],"the":[51,71,87,145],"capability":[52],"across":[56],"multiple":[57],"frames":[58,69,122],"and":[59,110,127,137,182],"understand":[60],"inter-frame":[61],"relationships.":[62],"Secondly,":[63],"processing":[64],"a":[65,81,93,139],"large":[66],"number":[67],"within":[70],"context":[72],"window":[73],"(LLMs)":[78],"can":[79],"impose":[80],"significant":[82],"computational":[83],"burden.":[84],"To":[85],"address":[86],"first":[88],"challenge,":[89],"we":[90,132],"introduce":[91],"ElysiumTrack-1M,":[92],"dataset":[96],"supported":[97],"for":[98],"three":[99],"tasks:":[100],"Single":[101,106],"Object":[102,107],"Tracking":[103,108],"(SOT),":[104],"Referring":[105,112],"(RSOT),":[109],"Video":[111],"Expression":[113],"Generation":[114],"(Video-REG).":[115],"ElysiumTrack-1M":[116],"contains":[117],"1.27":[118],"million":[119],"annotated":[120],"corresponding":[124],"boxes":[126],"descriptions.":[128],"Leveraging":[129],"this":[130],"dataset,":[131],"conduct":[133,167],"training":[134],"propose":[138],"token-compression":[140],"model":[141],"T-Selector":[142],"tackle":[144],"second":[146],"challenge.":[147],"Our":[148],"proposed":[149],"approach,":[150],"Elysium:":[151],"Exploring":[152],"Object-level":[153],"Perception":[154],"Videos":[156],"via":[157],"MLLM,":[158],"an":[160],"end-to-end":[161],"trainable":[162],"MLLM":[163],"that":[164],"attempts":[165],"object-level":[168],"tasks":[169],"videos":[171],"without":[172],"requiring":[173],"any":[174],"additional":[175],"plug-in":[176],"or":[177],"expert":[178],"models.":[179],"All":[180],"codes":[181],"are":[184],"available":[185],"at":[186],"https://github.com/Hon-Wong/Elysium.":[187]},"counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2026-03-11T14:59:36.786465","created_date":"2025-10-10T00:00:00"}
