{"id":"https://openalex.org/W4385714663","doi":"https://doi.org/10.48550/arxiv.2308.04014","title":"Continual Pre-Training of Large Language Models: How to (re)warm your model?","display_name":"Continual Pre-Training of Large Language Models: How to (re)warm your model?","publication_year":2023,"publication_date":"2023-08-08","ids":{"openalex":"https://openalex.org/W4385714663","doi":"https://doi.org/10.48550/arxiv.2308.04014"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2308.04014","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2308.04014","pdf_url":"https://arxiv.org/pdf/2308.04014","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2308.04014","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5038610806","display_name":"Kshitij Gupta","orcid":null},"institutions":[],"countries":[],"is_corresponding":true,"raw_author_name":"Gupta, Kshitij","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5051030091","display_name":"Benjamin Th\u00e9rien","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Th\u00e9rien, Benjamin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113231932","display_name":"Adam Ibrahim","orcid":"https://orcid.org/0009-0000-2833-4940"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ibrahim, Adam","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108024956","display_name":"Mats L. Richter","orcid":"https://orcid.org/0000-0002-0991-3047"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Richter, Mats L.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5015153122","display_name":"Quentin Anthony","orcid":"https://orcid.org/0000-0002-6823-9080"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Anthony, Quentin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5025113992","display_name":"Eugene Belilovsky","orcid":"https://orcid.org/0000-0002-2986-596X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Belilovsky, Eugene","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5055430458","display_name":"Irina Rish","orcid":"https://orcid.org/0000-0001-6856-5057"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Rish, Irina","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5006026661","display_name":"Timoth\u00e9e Lesort","orcid":"https://orcid.org/0000-0002-8669-0764"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lesort, Timoth\u00e9e","raw_affiliation_strings":[],"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":8,"corresponding_author_ids":["https://openalex.org/A5038610806"],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":12,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9990000128746033,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9954000115394592,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9815000295639038,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/perplexity","display_name":"Perplexity","score":0.918185830116272},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.8003714084625244},{"id":"https://openalex.org/keywords/downstream","display_name":"Downstream (manufacturing)","score":0.6330181956291199},{"id":"https://openalex.org/keywords/training","display_name":"Training (meteorology)","score":0.6058921813964844},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.5965522527694702},{"id":"https://openalex.org/keywords/upstream","display_name":"Upstream (networking)","score":0.5924952626228333},{"id":"https://openalex.org/keywords/schedule","display_name":"Schedule","score":0.4434964060783386},{"id":"https://openalex.org/keywords/process","display_name":"Process (computing)","score":0.42424821853637695},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3961174488067627},{"id":"https://openalex.org/keywords/machine-learning","display_name":"Machine learning","score":0.35001254081726074},{"id":"https://openalex.org/keywords/programming-language","display_name":"Programming language","score":0.09465506672859192}],"concepts":[{"id":"https://openalex.org/C100279451","wikidata":"https://www.wikidata.org/wiki/Q372193","display_name":"Perplexity","level":3,"score":0.918185830116272},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8003714084625244},{"id":"https://openalex.org/C2776207758","wikidata":"https://www.wikidata.org/wiki/Q5303302","display_name":"Downstream (manufacturing)","level":2,"score":0.6330181956291199},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.6058921813964844},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.5965522527694702},{"id":"https://openalex.org/C191172861","wikidata":"https://www.wikidata.org/wiki/Q7899321","display_name":"Upstream (networking)","level":2,"score":0.5924952626228333},{"id":"https://openalex.org/C68387754","wikidata":"https://www.wikidata.org/wiki/Q7271585","display_name":"Schedule","level":2,"score":0.4434964060783386},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.42424821853637695},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3961174488067627},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.35001254081726074},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.09465506672859192},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.0},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2308.04014","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2308.04014","pdf_url":"https://arxiv.org/pdf/2308.04014","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2308.04014","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2308.04014","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"article"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2308.04014","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2308.04014","pdf_url":"https://arxiv.org/pdf/2308.04014","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[],"awards":[{"id":"https://openalex.org/G3033334534","display_name":null,"funder_award_id":"AI Chair","funder_id":"https://openalex.org/F4320309949","funder_display_name":"Canadian Institute for Advanced Research"},{"id":"https://openalex.org/G8744305248","display_name":null,"funder_award_id":"Chair","funder_id":"https://openalex.org/F4320321481","funder_display_name":"Canada Excellence Research Chairs, Government of Canada"}],"funders":[{"id":"https://openalex.org/F4320309949","display_name":"Canadian Institute for Advanced Research","ror":"https://ror.org/01sdtdd95"},{"id":"https://openalex.org/F4320321481","display_name":"Canada Excellence Research Chairs, Government of Canada","ror":"https://ror.org/02tvrwm90"},{"id":"https://openalex.org/F4320323175","display_name":"Universit\u00e9 de Montr\u00e9al","ror":"https://ror.org/0161xgx34"},{"id":"https://openalex.org/F4320334841","display_name":"Fonds de recherche du Qu\u00e9bec \u2013 Nature et technologies","ror":"https://ror.org/00b9f9778"}],"has_content":{"pdf":true,"grobid_xml":false},"content_urls":{"pdf":"https://content.openalex.org/works/W4385714663.pdf"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2169518243","https://openalex.org/W2252095989","https://openalex.org/W3113528484","https://openalex.org/W4322096525","https://openalex.org/W2551914602","https://openalex.org/W2241146542","https://openalex.org/W4281893144","https://openalex.org/W2105076537","https://openalex.org/W2787311093","https://openalex.org/W3124058258"],"abstract_inverted_index":{"Large":[0],"language":[1,149],"models":[2,43,113,178,198],"(LLMs)":[3],"are":[4],"routinely":[5],"pre-trained":[6,42,114],"on":[7,66,103,115,127,145,183],"billions":[8],"of":[9,37,48,83,112],"tokens,":[10],"only":[11],"to":[12,32,97,125],"restart":[13],"the":[14,34,54,81,91,109,116,146,181,189,194],"process":[15],"over":[16],"again":[17],"once":[18],"new":[19,45,105],"data":[20,46,60],"becomes":[21],"available.":[22],"A":[23],"much":[24],"cheaper":[25],"and":[26,137,152,168,185],"more":[27],"efficient":[28,73],"solution":[29],"would":[30],"be":[31,95],"enable":[33],"continual":[35,74],"pre-training":[36,162],"these":[38],"models,":[39],"i.e.":[40],"updating":[41],"with":[44,160],"instead":[47],"re-training":[49],"them":[50],"from":[51,200],"scratch.":[52],"However,":[53],"distribution":[55],"shift":[56],"induced":[57],"by":[58],"novel":[59],"typically":[61],"results":[62,173],"in":[63,76,188],"degraded":[64],"performance":[65,154],"past":[67],"data.":[68],"Taking":[69],"a":[70,104,134,203],"step":[71],"towards":[72],"pre-training,":[75],"this":[77],"work,":[78],"we":[79,123],"examine":[80],"effect":[82],"different":[84,161],"warm-up":[85],"strategies.":[86],"Our":[87,172],"hypothesis":[88],"is":[89],"that":[90,175],"learning":[92,166],"rate":[93],"must":[94],"re-increased":[96],"improve":[98],"compute":[99],"efficiency":[100],"when":[101],"training":[102],"dataset.":[106,206],"We":[107,141,158],"study":[108],"warmup":[110,136,170],"phase":[111],"Pile":[117],"(upstream":[118],"data,":[119,130,187],"300B":[120],"tokens)":[121],"as":[122],"continue":[124],"pre-train":[126],"SlimPajama":[128],"(downstream":[129],"297B":[131],"tokens),":[132],"following":[133],"linear":[135],"cosine":[138],"decay":[139],"schedule.":[140],"conduct":[142],"all":[143],"experiments":[144],"Pythia":[147],"410M":[148],"model":[150],"architecture":[151],"evaluate":[153],"through":[155],"validation":[156],"perplexity.":[157],"experiment":[159],"checkpoints,":[163],"various":[164,169],"maximum":[165],"rates,":[167],"lengths.":[171],"show":[174],"while":[176],"rewarming":[177],"first":[179],"increases":[180],"loss":[182],"upstream":[184],"downstream":[186,195,205],"longer":[190],"run":[191],"it":[192],"improves":[193],"performance,":[196],"outperforming":[197],"trained":[199],"scratch$\\unicode{x2013}$even":[201],"for":[202],"large":[204]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":7},{"year":2024,"cited_by_count":3},{"year":2023,"cited_by_count":1}],"updated_date":"2026-03-29T08:15:47.926485","created_date":"2023-08-10T00:00:00"}
