{"id":"https://openalex.org/W4415933221","doi":"https://doi.org/10.5753/jbcs.2025.5788","title":"Building High-Quality Datasets for Portuguese LLMs: From Common Crawl Snapshots to Industrial-Grade Corpora","display_name":"Building High-Quality Datasets for Portuguese LLMs: From Common Crawl Snapshots to Industrial-Grade Corpora","publication_year":2025,"publication_date":"2025-10-27","ids":{"openalex":"https://openalex.org/W4415933221","doi":"https://doi.org/10.5753/jbcs.2025.5788"},"language":null,"primary_location":{"id":"doi:10.5753/jbcs.2025.5788","is_oa":true,"landing_page_url":"https://doi.org/10.5753/jbcs.2025.5788","pdf_url":"https://journals-sol.sbc.org.br/index.php/jbcs/article/download/5788/3559","source":{"id":"https://openalex.org/S69801987","display_name":"Journal of the Brazilian Computer Society","issn_l":"0104-6500","issn":["0104-6500","1678-4804"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of the Brazilian Computer Society","raw_type":"journal-article"},"type":"article","indexed_in":["crossref","doaj"],"open_access":{"is_oa":true,"oa_status":"diamond","oa_url":"https://journals-sol.sbc.org.br/index.php/jbcs/article/download/5788/3559","any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5009573252","display_name":"Thales Sales Almeida","orcid":"https://orcid.org/0009-0006-9568-9331"},"institutions":[{"id":"https://openalex.org/I181391015","display_name":"Universidade Estadual de Campinas (UNICAMP)","ror":"https://ror.org/04wffgt70","country_code":"BR","type":"education","lineage":["https://openalex.org/I181391015"]}],"countries":["BR"],"is_corresponding":true,"raw_author_name":"Thales Sales Almeida","raw_affiliation_strings":["Institute of Computing, University of Campinas; Maritaca AI"],"affiliations":[{"raw_affiliation_string":"Institute of Computing, University of Campinas; Maritaca AI","institution_ids":["https://openalex.org/I181391015"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030647281","display_name":"Rodrigo Nogueira","orcid":"https://orcid.org/0000-0002-2600-6035"},"institutions":[{"id":"https://openalex.org/I181391015","display_name":"Universidade Estadual de Campinas (UNICAMP)","ror":"https://ror.org/04wffgt70","country_code":"BR","type":"education","lineage":["https://openalex.org/I181391015"]}],"countries":["BR"],"is_corresponding":false,"raw_author_name":"Rodrigo Nogueira","raw_affiliation_strings":["School of Electrical and Computer Engineering, University of Campinas; Maritaca AI"],"affiliations":[{"raw_affiliation_string":"School of Electrical and Computer Engineering, University of Campinas; Maritaca AI","institution_ids":["https://openalex.org/I181391015"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5065725754","display_name":"H\u00e9lio Pedrini","orcid":"https://orcid.org/0000-0003-0125-630X"},"institutions":[{"id":"https://openalex.org/I181391015","display_name":"Universidade Estadual de Campinas (UNICAMP)","ror":"https://ror.org/04wffgt70","country_code":"BR","type":"education","lineage":["https://openalex.org/I181391015"]}],"countries":["BR"],"is_corresponding":false,"raw_author_name":"Helio Pedrini","raw_affiliation_strings":["Institute of Computing, University of Campinas"],"affiliations":[{"raw_affiliation_string":"Institute of Computing, University of Campinas","institution_ids":["https://openalex.org/I181391015"]}]}],"institutions":[],"countries_distinct_count":1,"institutions_distinct_count":3,"corresponding_author_ids":["https://openalex.org/A5009573252"],"corresponding_institution_ids":["https://openalex.org/I181391015"],"apc_list":null,"apc_paid":null,"fwci":4.9698,"has_fulltext":true,"cited_by_count":2,"citation_normalized_percentile":{"value":0.955529,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":91,"max":99},"biblio":{"volume":"31","issue":"1","first_page":"1247","last_page":"1263"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.3172000050544739,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.3172000050544739,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13629","display_name":"Text Readability and Simplification","score":0.24199999868869781,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13910","display_name":"Computational and Text Analysis Methods","score":0.0731000006198883,"subfield":{"id":"https://openalex.org/subfields/3300","display_name":"General Social Sciences"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/portuguese","display_name":"Portuguese","score":0.6753000020980835},{"id":"https://openalex.org/keywords/construct","display_name":"Construct (python library)","score":0.631600022315979},{"id":"https://openalex.org/keywords/preprocessor","display_name":"Preprocessor","score":0.6205000281333923},{"id":"https://openalex.org/keywords/scalability","display_name":"Scalability","score":0.5338000059127808},{"id":"https://openalex.org/keywords/selection","display_name":"Selection (genetic algorithm)","score":0.5200999975204468},{"id":"https://openalex.org/keywords/training-set","display_name":"Training set","score":0.4636000096797943},{"id":"https://openalex.org/keywords/language-model","display_name":"Language model","score":0.4325000047683716}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.84579998254776},{"id":"https://openalex.org/C35219183","wikidata":"https://www.wikidata.org/wiki/Q5146","display_name":"Portuguese","level":2,"score":0.6753000020980835},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.631600022315979},{"id":"https://openalex.org/C34736171","wikidata":"https://www.wikidata.org/wiki/Q918333","display_name":"Preprocessor","level":2,"score":0.6205000281333923},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5900999903678894},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.569599986076355},{"id":"https://openalex.org/C48044578","wikidata":"https://www.wikidata.org/wiki/Q727490","display_name":"Scalability","level":2,"score":0.5338000059127808},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.5200999975204468},{"id":"https://openalex.org/C51632099","wikidata":"https://www.wikidata.org/wiki/Q3985153","display_name":"Training set","level":2,"score":0.4636000096797943},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.4325000047683716},{"id":"https://openalex.org/C10551718","wikidata":"https://www.wikidata.org/wiki/Q5227332","display_name":"Data pre-processing","level":2,"score":0.39820000529289246},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.39430001378059387},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.3912000060081482},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.34549999237060547},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.32510000467300415},{"id":"https://openalex.org/C2474386","wikidata":"https://www.wikidata.org/wiki/Q461183","display_name":"Text corpus","level":2,"score":0.2872999906539917},{"id":"https://openalex.org/C2776291640","wikidata":"https://www.wikidata.org/wiki/Q2912517","display_name":"Value (mathematics)","level":2,"score":0.26669999957084656},{"id":"https://openalex.org/C108583219","wikidata":"https://www.wikidata.org/wiki/Q197536","display_name":"Deep learning","level":2,"score":0.2597000002861023},{"id":"https://openalex.org/C18762648","wikidata":"https://www.wikidata.org/wiki/Q42213","display_name":"Work (physics)","level":2,"score":0.258899986743927}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.5753/jbcs.2025.5788","is_oa":true,"landing_page_url":"https://doi.org/10.5753/jbcs.2025.5788","pdf_url":"https://journals-sol.sbc.org.br/index.php/jbcs/article/download/5788/3559","source":{"id":"https://openalex.org/S69801987","display_name":"Journal of the Brazilian Computer Society","issn_l":"0104-6500","issn":["0104-6500","1678-4804"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of the Brazilian Computer Society","raw_type":"journal-article"}],"best_oa_location":{"id":"doi:10.5753/jbcs.2025.5788","is_oa":true,"landing_page_url":"https://doi.org/10.5753/jbcs.2025.5788","pdf_url":"https://journals-sol.sbc.org.br/index.php/jbcs/article/download/5788/3559","source":{"id":"https://openalex.org/S69801987","display_name":"Journal of the Brazilian Computer Society","issn_l":"0104-6500","issn":["0104-6500","1678-4804"],"is_oa":true,"is_in_doaj":true,"is_core":true,"host_organization":"https://openalex.org/P4310319900","host_organization_name":"Springer Science+Business Media","host_organization_lineage":["https://openalex.org/P4310319900","https://openalex.org/P4310319965"],"host_organization_lineage_names":["Springer Science+Business Media","Springer Nature"],"type":"journal"},"license":"cc-by-nc","license_id":"https://openalex.org/licenses/cc-by-nc","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Journal of the Brazilian Computer Society","raw_type":"journal-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4415933221.pdf","grobid_xml":"https://content.openalex.org/works/W4415933221.grobid-xml"},"referenced_works_count":27,"referenced_works":["https://openalex.org/W2120253242","https://openalex.org/W2251939518","https://openalex.org/W2905246312","https://openalex.org/W2946659172","https://openalex.org/W2991878188","https://openalex.org/W3008931407","https://openalex.org/W3034617741","https://openalex.org/W3045958725","https://openalex.org/W3096266342","https://openalex.org/W3137010024","https://openalex.org/W3169483174","https://openalex.org/W3177057043","https://openalex.org/W3177765786","https://openalex.org/W3194676777","https://openalex.org/W3213241618","https://openalex.org/W4284691825","https://openalex.org/W4385572438","https://openalex.org/W4387522367","https://openalex.org/W4387546783","https://openalex.org/W4389746748","https://openalex.org/W4391940656","https://openalex.org/W4392402185","https://openalex.org/W4399364305","https://openalex.org/W4402081998","https://openalex.org/W4402671353","https://openalex.org/W4408229604","https://openalex.org/W4412599294"],"related_works":[],"abstract_inverted_index":{"The":[0],"performance":[1,90,138],"of":[2,15,21,107,143],"large":[3],"language":[4,135],"models":[5],"(LLMs)":[6],"is":[7],"deeply":[8],"influenced":[9],"by":[10],"the":[11,22,105,133,141],"quality":[12],"and":[13,85,118],"composition":[14],"their":[16],"training":[17,39],"data.":[18,146],"While":[19,147],"much":[20],"existing":[23],"work":[24],"has":[25],"centered":[26],"on":[27,152],"English,":[28],"there":[29],"remains":[30],"a":[31,59,75,93,130],"gap":[32],"in":[33,64,97],"understanding":[34],"how":[35,81],"to":[36,57,70,99,132,137,158],"construct":[37],"effective":[38],"corpora":[40,51],"for":[41,48,52,113,163],"other":[42,159],"languages.":[43],"We":[44,54,126],"explore":[45],"scalable":[46],"methods":[47,155],"building":[49],"web-based":[50],"LLMs.":[53],"apply":[55],"them":[56],"build":[58],"new":[60],"120B":[61],"token":[62],"corpus":[63],"Portuguese":[65],"that":[66,128],"achieves":[67],"competitive":[68],"results":[69],"an":[71],"industrial-grade":[72],"corpus.":[73],"Using":[74],"continual":[76],"pretraining":[77],"setup,":[78],"we":[79],"study":[80,150],"different":[82],"data":[83],"selection":[84],"preprocessing":[86],"strategies":[87],"affect":[88],"LLM":[89,165],"when":[91],"transitioning":[92],"model":[94,131],"originally":[95],"trained":[96],"English":[98],"another":[100],"language.":[101],"Our":[102],"findings":[103],"demonstrate":[104],"value":[106],"language-specific":[108,145],"filtering":[109],"pipelines,":[110],"including":[111],"classifiers":[112],"education,":[114],"science,":[115],"technology,":[116],"engineering,":[117],"mathematics":[119],"(STEM),":[120],"as":[121,123],"well":[122],"toxic":[124],"content.":[125],"show":[127],"adapting":[129],"target":[134],"leads":[136],"improvements,":[139],"reinforcing":[140],"importance":[142],"high-quality,":[144],"our":[148,154],"case":[149],"focuses":[151],"Portuguese,":[153],"are":[156],"applicable":[157],"languages,":[160],"offering":[161],"insights":[162],"multilingual":[164],"development.":[166]},"counts_by_year":[{"year":2026,"cited_by_count":1},{"year":2025,"cited_by_count":1}],"updated_date":"2026-03-27T05:58:40.876381","created_date":"2025-11-05T00:00:00"}
