tag. We found the following text: ' + text); + const wrapper = document.createElement('span'); + wrapper.innerHTML = addedNode.nodeValue; + addedNode.parentNode.insertBefore(wrapper, addedNode); + addedNode.parentNode.removeChild(addedNode); + } + } break; + } + } + } + }).observe(this, {childList: true}); + } + + } + + var commonjsGlobal = typeof globalThis !== 'undefined' ? globalThis : typeof window !== 'undefined' ? window : typeof global !== 'undefined' ? global : typeof self !== 'undefined' ? self : {}; + + function createCommonjsModule(fn, module) { + return module = { exports: {} }, fn(module, module.exports), module.exports; + } + + var bibtexParse = createCommonjsModule(function (module, exports) { + /* start bibtexParse 0.0.22 */ + + //Original work by Henrik Muehe (c) 2010 + // + //CommonJS port by Mikola Lysenko 2013 + // + //Port to Browser lib by ORCID / RCPETERS + // + //Issues: + //no comment handling within strings + //no string concatenation + //no variable values yet + //Grammar implemented here: + //bibtex -> (string | preamble | comment | entry)*; + //string -> '@STRING' '{' key_equals_value '}'; + //preamble -> '@PREAMBLE' '{' value '}'; + //comment -> '@COMMENT' '{' value '}'; + //entry -> '@' key '{' key ',' key_value_list '}'; + //key_value_list -> key_equals_value (',' key_equals_value)*; + //key_equals_value -> key '=' value; + //value -> value_quotes | value_braces | key; + //value_quotes -> '"' .*? '"'; // not quite + //value_braces -> '{' .*? '"'; // not quite + (function(exports) { + + function BibtexParser() { + + this.months = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]; + this.notKey = [',','{','}',' ','=']; + this.pos = 0; + this.input = ""; + this.entries = new Array(); + + this.currentEntry = ""; + + this.setInput = function(t) { + this.input = t; + }; + + this.getEntries = function() { + return this.entries; + }; + + this.isWhitespace = function(s) { + return (s == ' ' || s == '\r' || s == '\t' || s == '\n'); + }; + + this.match = function(s, canCommentOut) { + if (canCommentOut == undefined || canCommentOut == null) + canCommentOut = true; + this.skipWhitespace(canCommentOut); + if (this.input.substring(this.pos, this.pos + s.length) == s) { + this.pos += s.length; + } else { + throw "Token mismatch, expected " + s + ", found " + + this.input.substring(this.pos); + } this.skipWhitespace(canCommentOut); + }; + + this.tryMatch = function(s, canCommentOut) { + if (canCommentOut == undefined || canCommentOut == null) + canCommentOut = true; + this.skipWhitespace(canCommentOut); + if (this.input.substring(this.pos, this.pos + s.length) == s) { + return true; + } else { + return false; + } }; + + /* when search for a match all text can be ignored, not just white space */ + this.matchAt = function() { + while (this.input.length > this.pos && this.input[this.pos] != '@') { + this.pos++; + } + if (this.input[this.pos] == '@') { + return true; + } return false; + }; + + this.skipWhitespace = function(canCommentOut) { + while (this.isWhitespace(this.input[this.pos])) { + this.pos++; + } if (this.input[this.pos] == "%" && canCommentOut == true) { + while (this.input[this.pos] != "\n") { + this.pos++; + } this.skipWhitespace(canCommentOut); + } }; + + this.value_braces = function() { + var bracecount = 0; + this.match("{", false); + var start = this.pos; + var escaped = false; + while (true) { + if (!escaped) { + if (this.input[this.pos] == '}') { + if (bracecount > 0) { + bracecount--; + } else { + var end = this.pos; + 
this.match("}", false); + return this.input.substring(start, end); + } } else if (this.input[this.pos] == '{') { + bracecount++; + } else if (this.pos >= this.input.length - 1) { + throw "Unterminated value"; + } } if (this.input[this.pos] == '\\' && escaped == false) + escaped = true; + else + escaped = false; + this.pos++; + } }; + + this.value_comment = function() { + var str = ''; + var brcktCnt = 0; + while (!(this.tryMatch("}", false) && brcktCnt == 0)) { + str = str + this.input[this.pos]; + if (this.input[this.pos] == '{') + brcktCnt++; + if (this.input[this.pos] == '}') + brcktCnt--; + if (this.pos >= this.input.length - 1) { + throw "Unterminated value:" + this.input.substring(start); + } this.pos++; + } return str; + }; + + this.value_quotes = function() { + this.match('"', false); + var start = this.pos; + var escaped = false; + while (true) { + if (!escaped) { + if (this.input[this.pos] == '"') { + var end = this.pos; + this.match('"', false); + return this.input.substring(start, end); + } else if (this.pos >= this.input.length - 1) { + throw "Unterminated value:" + this.input.substring(start); + } } + if (this.input[this.pos] == '\\' && escaped == false) + escaped = true; + else + escaped = false; + this.pos++; + } }; + + this.single_value = function() { + var start = this.pos; + if (this.tryMatch("{")) { + return this.value_braces(); + } else if (this.tryMatch('"')) { + return this.value_quotes(); + } else { + var k = this.key(); + if (k.match("^[0-9]+$")) + return k; + else if (this.months.indexOf(k.toLowerCase()) >= 0) + return k.toLowerCase(); + else + throw "Value expected:" + this.input.substring(start) + ' for key: ' + k; + + } }; + + this.value = function() { + var values = []; + values.push(this.single_value()); + while (this.tryMatch("#")) { + this.match("#"); + values.push(this.single_value()); + } return values.join(""); + }; + + this.key = function() { + var start = this.pos; + while (true) { + if (this.pos >= this.input.length) { + throw "Runaway key"; + } // а-яА-Я is Cyrillic + //console.log(this.input[this.pos]); + if (this.notKey.indexOf(this.input[this.pos]) >= 0) { + return this.input.substring(start, this.pos); + } else { + this.pos++; + + } } }; + + this.key_equals_value = function() { + var key = this.key(); + if (this.tryMatch("=")) { + this.match("="); + var val = this.value(); + return [ key, val ]; + } else { + throw "... 
= value expected, equals sign missing:" + + this.input.substring(this.pos); + } }; + + this.key_value_list = function() { + var kv = this.key_equals_value(); + this.currentEntry['entryTags'] = {}; + this.currentEntry['entryTags'][kv[0]] = kv[1]; + while (this.tryMatch(",")) { + this.match(","); + // fixes problems with commas at the end of a list + if (this.tryMatch("}")) { + break; + } + kv = this.key_equals_value(); + this.currentEntry['entryTags'][kv[0]] = kv[1]; + } }; + + this.entry_body = function(d) { + this.currentEntry = {}; + this.currentEntry['citationKey'] = this.key(); + this.currentEntry['entryType'] = d.substring(1); + this.match(","); + this.key_value_list(); + this.entries.push(this.currentEntry); + }; + + this.directive = function() { + this.match("@"); + return "@" + this.key(); + }; + + this.preamble = function() { + this.currentEntry = {}; + this.currentEntry['entryType'] = 'PREAMBLE'; + this.currentEntry['entry'] = this.value_comment(); + this.entries.push(this.currentEntry); + }; + + this.comment = function() { + this.currentEntry = {}; + this.currentEntry['entryType'] = 'COMMENT'; + this.currentEntry['entry'] = this.value_comment(); + this.entries.push(this.currentEntry); + }; + + this.entry = function(d) { + this.entry_body(d); + }; + + this.bibtex = function() { + while (this.matchAt()) { + var d = this.directive(); + this.match("{"); + if (d == "@STRING") { + this.string(); + } else if (d == "@PREAMBLE") { + this.preamble(); + } else if (d == "@COMMENT") { + this.comment(); + } else { + this.entry(d); + } + this.match("}"); + } }; + } + exports.toJSON = function(bibtex) { + var b = new BibtexParser(); + b.setInput(bibtex); + b.bibtex(); + return b.entries; + }; + + /* added during hackathon don't hate on me */ + exports.toBibtex = function(json) { + var out = ''; + for ( var i in json) { + out += "@" + json[i].entryType; + out += '{'; + if (json[i].citationKey) + out += json[i].citationKey + ', '; + if (json[i].entry) + out += json[i].entry ; + if (json[i].entryTags) { + var tags = ''; + for (var jdx in json[i].entryTags) { + if (tags.length != 0) + tags += ', '; + tags += jdx + '= {' + json[i].entryTags[jdx] + '}'; + } + out += tags; + } + out += '}\n\n'; + } + return out; + + }; + + })( exports); + + /* end bibtexParse */ + }); + + // Copyright 2018 The Distill Template Authors + + function normalizeTag(string) { + return string + .replace(/[\t\n ]+/g, ' ') + .replace(/{\\["^`.'acu~Hvs]( )?([a-zA-Z])}/g, (full, x, char) => char) + .replace(/{\\([a-zA-Z])}/g, (full, char) => char); + } + + function parseBibtex(bibtex) { + const bibliography = new Map(); + const parsedEntries = bibtexParse.toJSON(bibtex); + for (const entry of parsedEntries) { + // normalize tags; note entryTags is an object, not Map + for (const [key, value] of Object.entries(entry.entryTags)) { + entry.entryTags[key.toLowerCase()] = normalizeTag(value); + } + entry.entryTags.type = entry.entryType; + // add to bibliography + bibliography.set(entry.citationKey, entry.entryTags); + } + return bibliography; + } + + function serializeFrontmatterToBibtex(frontMatter) { + return `@article{${frontMatter.slug}, + author = {${frontMatter.bibtexAuthors}}, + title = {${frontMatter.title}}, + journal = {${frontMatter.journal.title}}, + year = {${frontMatter.publishedYear}}, + note = {${frontMatter.url}}, + doi = {${frontMatter.doi}} +}`; + } + + // Copyright 2018 The Distill Template Authors + + class Bibliography extends HTMLElement { + + static get is() { return 'd-bibliography'; } + + constructor() { + 
super(); + + // set up mutation observer + const options = {childList: true, characterData: true, subtree: true}; + const observer = new MutationObserver( (entries) => { + for (const entry of entries) { + if (entry.target.nodeName === 'SCRIPT' || entry.type === 'characterData') { + this.parseIfPossible(); + } + } + }); + observer.observe(this, options); + } + + connectedCallback() { + requestAnimationFrame(() => { + this.parseIfPossible(); + }); + } + + parseIfPossible() { + const scriptTag = this.querySelector('script'); + if (!scriptTag) return; + if (scriptTag.type == 'text/bibtex') { + const newBibtex = scriptTag.textContent; + if (this.bibtex !== newBibtex) { + this.bibtex = newBibtex; + const bibliography = parseBibtex(this.bibtex); + this.notify(bibliography); + } + } else if (scriptTag.type == 'text/json') { + const bibliography = new Map(JSON.parse(scriptTag.textContent)); + this.notify(bibliography); + } else { + console.warn('Unsupported bibliography script tag type: ' + scriptTag.type); + } + } + + notify(bibliography) { + const options = { detail: bibliography, bubbles: true }; + const event = new CustomEvent('onBibliographyChanged', options); + this.dispatchEvent(event); + } + + /* observe 'src' attribute */ + + static get observedAttributes() { + return ['src']; + } + + receivedBibtex(event) { + const bibliography = parseBibtex(event.target.response); + this.notify(bibliography); + } + + attributeChangedCallback(name, oldValue, newValue) { + var oReq = new XMLHttpRequest(); + oReq.onload = (e) => this.receivedBibtex(e); + oReq.onerror = () => console.warn(`Could not load Bibtex! (tried ${newValue})`); + oReq.responseType = 'text'; + oReq.open('GET', newValue, true); + oReq.send(); + } + + + } + + // Copyright 2018 The Distill Template Authors + // + // Licensed under the Apache License, Version 2.0 (the "License"); + // you may not use this file except in compliance with the License. + // You may obtain a copy of the License at + // + // http://www.apache.org/licenses/LICENSE-2.0 + // + // Unless required by applicable law or agreed to in writing, software + // distributed under the License is distributed on an "AS IS" BASIS, + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + // See the License for the specific language governing permissions and + // limitations under the License. + + // import style from '../styles/d-byline.css'; + + function bylineTemplate(frontMatter) { + return ` +
+`; + } + + class Byline extends HTMLElement { + + static get is() { return 'd-byline'; } + + set frontMatter(frontMatter) { + this.innerHTML = bylineTemplate(frontMatter); + } + + } + + // Copyright 2018 The Distill Template Authors + + const T$3 = Template( + "d-cite", + ` + + +
+
+`);
+
+ class Code extends Mutating(T$4(HTMLElement)) {
+
+ renderContent() {
+
+ // check if language can be highlighted
+ this.languageName = this.getAttribute('language');
+ if (!this.languageName) {
+ console.warn('You need to provide a language attribute to your `; + if (frontMatter.githubCompareUpdatesUrl) { + html += `View all changes to this article since it was first published.`; + } + html += ` + If you see mistakes or want to suggest changes, please create an issue on GitHub.
+ `; + } + + const journal = frontMatter.journal; + if (typeof journal !== 'undefined' && journal.title === 'Distill') { + html += ` +Diagrams and text are licensed under Creative Commons Attribution CC-BY 4.0 with the source available on GitHub, unless noted otherwise. The figures that have been reused from other sources don’t fall under this license and can be recognized by a note in their caption: “Figure from …”.
+ `; + } + + if (typeof frontMatter.publishedDate !== 'undefined') { + html += ` +For attribution in academic contexts, please cite this work as
+${frontMatter.concatenatedAuthors}, "${frontMatter.title}", Distill, ${frontMatter.publishedYear}.+
BibTeX citation
+${serializeFrontmatterToBibtex(frontMatter)}+ `; + } + + return html; + } + + class DistillAppendix extends HTMLElement { + + static get is() { return 'distill-appendix'; } + + set frontMatter(frontMatter) { + this.innerHTML = appendixTemplate(frontMatter); + } + + } + + const footerTemplate = ` + + + + +`; + + // Copyright 2018 The Distill Template Authors + + const T$c = Template('distill-footer', footerTemplate); + + class DistillFooter extends T$c(HTMLElement) { + + } + + // Copyright 2018 The Distill Template Authors + + let templateIsLoading = false; + let runlevel = 0; + const initialize = function() { + if (window.distill.runlevel < 1) { + throw new Error("Insufficient Runlevel for Distill Template!"); + } + + /* 1. Flag that we're being loaded */ + if ("distill" in window && window.distill.templateIsLoading) { + throw new Error( + "Runlevel 1: Distill Template is getting loaded more than once, aborting!" + ); + } else { + window.distill.templateIsLoading = true; + console.debug("Runlevel 1: Distill Template has started loading."); + } + + /* 2. Add styles if they weren't added during prerendering */ + makeStyleTag(document); + console.debug("Runlevel 1: Static Distill styles have been added."); + console.debug("Runlevel 1->2."); + window.distill.runlevel += 1; + + /* 3. Register Controller listener functions */ + /* Needs to happen before components to their connected callbacks have a controller to talk to. */ + for (const [functionName, callback] of Object.entries(Controller.listeners)) { + if (typeof callback === "function") { + document.addEventListener(functionName, callback); + } else { + console.error("Runlevel 2: Controller listeners need to be functions!"); + } + } + console.debug("Runlevel 2: We can now listen to controller events."); + console.debug("Runlevel 2->3."); + window.distill.runlevel += 1; + + /* 4. Register components */ + const components = [ + Abstract, Appendix, Article, Bibliography, Byline, Cite, CitationList, Code, + Footnote, FootnoteList, FrontMatter$1, HoverBox, Title, DMath, References, TOC, Figure, + Slider, Interstitial + ]; + + const distillComponents = [DistillHeader, DistillAppendix, DistillFooter]; + + if (window.distill.runlevel < 2) { + throw new Error("Insufficient Runlevel for adding custom elements!"); + } + const allComponents = components.concat(distillComponents); + for (const component of allComponents) { + console.debug("Runlevel 2: Registering custom element: " + component.is); + customElements.define(component.is, component); + } + + console.debug( + "Runlevel 3: Distill Template finished registering custom elements." + ); + console.debug("Runlevel 3->4."); + window.distill.runlevel += 1; + + // If template was added after DOMContentLoaded we may have missed that event. + // Controller will check for that case, so trigger the event explicitly: + if (domContentLoaded()) { + Controller.listeners.DOMContentLoaded(); + } + + console.debug("Runlevel 4: Distill Template initialisation complete."); + window.distill.templateIsLoading = false; + window.distill.templateHasLoaded = true; + }; + + window.distill = { runlevel, initialize, templateIsLoading }; + + /* 0. 
Check browser feature support; synchronously polyfill if needed */ + if (Polyfills.browserSupportsAllFeatures()) { + console.debug("Runlevel 0: No need for polyfills."); + console.debug("Runlevel 0->1."); + window.distill.runlevel += 1; + window.distill.initialize(); + } else { + console.debug("Runlevel 0: Distill Template is loading polyfills."); + Polyfills.load(window.distill.initialize); + } + +}))); +//# sourceMappingURL=template.v2.js.map diff --git a/app/src/fine_tasks.js b/app/src/fine_tasks.js new file mode 100644 index 0000000000000000000000000000000000000000..734beee10de9f0288ae962acbab3962beee04040 --- /dev/null +++ b/app/src/fine_tasks.js @@ -0,0 +1,245 @@ +import Papa from 'papaparse'; +import { DataTable } from 'simple-datatables'; + +const languageMap = { + 'Arabic': 'ar', + 'Turkish': 'tr', + 'Swahili': 'sw', + 'Russian': 'ru', + 'Telugu': 'te', + 'Thai': 'th', + 'Chinese': 'zh', + 'French': 'fr', + 'Hindi': 'hi', +}; + +const metricTypes = [ + { value: 'max_score', label: 'Max Score' }, + { value: 'avg_snr', label: 'Low Noise' }, + { value: 'avg_spearman', label: 'Monotonicity' }, + { value: 'max_n_std', label: 'Non-Randomness' }, + { value: 'avg_kendall_tau_a', label: 'Ordering Consistency' } +]; + +const tableTypes = [ + { value: 'gen', label: 'Generative' }, + { value: 'mc', label: 'Multichoice' } +]; + +const taskFolders = [ + { value: 'selected', label: 'FineTasks' }, + { value: 'non_selected', label: 'Non-Selected' } +]; + +function createDropdown(options, onChange) { + const select = document.createElement('select'); + options.forEach(option => { + const optionElement = document.createElement('option'); + if (typeof option === 'object' && option.value && option.label) { + optionElement.value = option.value; + optionElement.textContent = option.label; + } else { + optionElement.value = option; + optionElement.textContent = option; + } + select.appendChild(optionElement); + }); + select.addEventListener('change', onChange); + return select; +} + +function createPerTaskResultsTable(data, tableType, metric) { + const tableWrapper = document.createElement('div'); + tableWrapper.className = 'table-wrapper fine-tasks-table-wrapper'; + + const table = document.createElement('table'); + table.className = 'results-table fine-tasks-results-table'; + + const columns = ['Task', 'Type', ...(tableType === 'gen' ? 
['f1', 'prefix_match'] : ['acc', 'acc_norm', 'acc_norm_token', 'acc_norm_pmi'])]; + + const columnNameMap = { + // 'Task': 'Task', + // 'Type': 'Type', + // 'f1': 'f1', + // 'prefix_match': 'prefix_match', + // 'acc': 'acc', + 'acc_norm': 'acc_char', + 'acc_norm_token': 'acc_token', + 'acc_norm_pmi': 'acc_pmi', + 'prefix_match': 'prefix' + }; + + const taskMetricMap = { + 'max_score': 'score', + 'avg_snr': 'snr', + 'avg_spearman': 'monotonicity', + 'max_n_std': 'non-randomness', + 'avg_kendall_tau_a': 'ordering' + // 'avg_spearman': 'monotonicity', + } + + const header = table.createTHead(); + const headerRow = header.insertRow(); + columns.forEach(column => { + const th = document.createElement('th'); + th.textContent = columnNameMap[column] || column; + + if (th.textContent !== "Task" && th.textContent !== "Type") { + th.textContent += " " + (taskMetricMap[metric] || metric); + } + th.title = th.textContent; + if (column === 'Type') + th.style.width = '40px'; + headerRow.appendChild(th); + }); + + const body = table.createTBody(); + data.forEach(row => { + if (Object.values(row).every(value => value === '' || value === undefined || value === null)) { + return; + } + + const tr = body.insertRow(); + columns.forEach(column => { + const td = tr.insertCell(); + let value = row[column]; + if (column === 'Task') { + const fullTaskName = value; // Store the full task name + const parts = value.split('|'); + value = parts.length > 1 ? parts[1] : value; + value = value.split('_mcf')[0].split('_cf')[0]; + td.title = fullTaskName; // Set the title attribute to show the full name on hover + } else if (column === 'Type') { + // Keep the task type as is + } else if (typeof value === 'number') { + value = value.toFixed(2); + } else if (value && !isNaN(parseFloat(value))) { + value = parseFloat(value).toFixed(2); + } else { + value = ''; + } + td.textContent = value; + }); + }); + + tableWrapper.appendChild(table); + return tableWrapper; +} + +export function initFineTasks(containerId) { + const container = document.getElementById(containerId); + if (!container) return; + + const perTaskTitleElement = document.createElement('h3'); + perTaskTitleElement.textContent = 'Task Results'; + perTaskTitleElement.className = 'fine-tasks-title'; + + const perTaskTableContainer = document.createElement('div'); + perTaskTableContainer.className = 'table-container'; + + let perTaskDataTable; + + function updatePerTaskResults() { + const language = languageDropdownPerTask.value; + const metric = metricDropdownPerTask.value; + const tableType = tableTypeDropdownPerTask.value; + const taskFolder = taskFolderDropdownPerTask.value; + + const languageCode = languageMap[language]; + + if (!languageCode) { + console.error(`Language code not found for ${language}`); + perTaskTableContainer.innerHTML = `
Error: Language code not found for ${language}
`; + return; + } + + let url = `data/tasks/${taskFolder}/${languageCode}/${metric}/${tableType}_stats.csv`; + + fetch(url) + .then(response => { + if (!response.ok) { + throw new Error(`HTTP error! status: ${response.status}`); + } + return response.text(); + }) + .then(csvText => { + const results = Papa.parse(csvText, { header: true }).data; + perTaskTableContainer.innerHTML = ''; + const tableWrapper = createPerTaskResultsTable(results, tableType, metric); + perTaskTableContainer.appendChild(tableWrapper); + + if (perTaskDataTable) { + perTaskDataTable.destroy(); + } + + perTaskDataTable = new DataTable('.fine-tasks-results-table', { + perPage: 10, + perPageSelect: false, + searchable: false, + sortable: true, + fixedHeight: true, + labels: { + info: '' // This removes the "Showing 1 to X of Y entries" text + } + }); + + }) + .catch(error => { + console.error('Error fetching CSV:', error); + perTaskTableContainer.innerHTML = `Error loading data: ${error.message}
`; + }); + } + + const perTaskControls = document.createElement('div'); + perTaskControls.className = 'controls fine-tasks-controls'; + + // Task folder control group + const taskFolderControlGroup = document.createElement('div'); + taskFolderControlGroup.className = 'control-group'; + const taskFolderLabelPerTask = document.createElement('label'); + taskFolderLabelPerTask.textContent = 'Task Set: '; + const taskFolderDropdownPerTask = createDropdown(taskFolders, updatePerTaskResults); + taskFolderDropdownPerTask.value = 'selected'; // Set default to FineTasks + taskFolderControlGroup.appendChild(taskFolderLabelPerTask); + taskFolderControlGroup.appendChild(taskFolderDropdownPerTask); + + // Language control group + const languageControlGroup = document.createElement('div'); + languageControlGroup.className = 'control-group'; + const languageLabelPerTask = document.createElement('label'); + languageLabelPerTask.textContent = 'Language: '; + const languageDropdownPerTask = createDropdown(Object.keys(languageMap), updatePerTaskResults); + languageControlGroup.appendChild(languageLabelPerTask); + languageControlGroup.appendChild(languageDropdownPerTask); + + // Table type control group + const tableTypeControlGroup = document.createElement('div'); + tableTypeControlGroup.className = 'control-group'; + const tableTypeLabelPerTask = document.createElement('label'); + tableTypeLabelPerTask.textContent = 'Type: '; + const tableTypeDropdownPerTask = createDropdown(tableTypes, updatePerTaskResults); + tableTypeControlGroup.appendChild(tableTypeLabelPerTask); + tableTypeControlGroup.appendChild(tableTypeDropdownPerTask); + + // Metric control group + const metricControlGroup = document.createElement('div'); + metricControlGroup.className = 'control-group'; + const metricLabelPerTask = document.createElement('label'); + metricLabelPerTask.textContent = 'Criteria: '; + const metricDropdownPerTask = createDropdown(metricTypes, updatePerTaskResults); + metricDropdownPerTask.value = 'max_score'; // Set default to Max Score + metricControlGroup.appendChild(metricLabelPerTask); + metricControlGroup.appendChild(metricDropdownPerTask); + + perTaskControls.appendChild(taskFolderControlGroup); + perTaskControls.appendChild(languageControlGroup); + perTaskControls.appendChild(tableTypeControlGroup); + perTaskControls.appendChild(metricControlGroup); + + container.appendChild(perTaskControls); + // container.appendChild(perTaskTitleElement); + container.appendChild(perTaskTableContainer); + + // Initialize with default values + updatePerTaskResults(); +} diff --git a/app/src/index.html b/app/src/index.html new file mode 100644 index 0000000000000000000000000000000000000000..28f13d03cfffbf5fe0cc6df3a5242322365d6fbf --- /dev/null +++ b/app/src/index.html @@ -0,0 +1,388 @@ + + + + + + + +Following the strong community reception of our FineWeb English dataset
However, we quickly encountered a significant challenge: how can one effectively evaluate models across different languages during training?
+ +For English, it's straightforward: we can utilize well-established benchmarks like MMLU
To address these challenges, we developed a scalable and data-driven framework for evaluation task selection, which allows anyone to choose strong model evaluations for their language from existing tasks! We then applied this framework to a set of 9 diverse languages, resulting in the creation of FineTasks - a comprehensive and diverse multilingual evaluation suite.
+ +In this blog post, we discuss:
+
Covering all 7000+ languages spoken around the world would be a monumental endeavor, so we settled on 9 languages that offer diversity in script, language family, and resource availability: Chinese, French, Arabic, Russian, Thai, Hindi, Turkish, Swahili, and Telugu.
+
For these languages, we collected all the available tasks we could find, implementing a total of 185 tasks across languages in LightEval, Hugging Face's model evaluation library.
+ +Then, we began task selection with two primary goals: ensuring evaluation diversity, and making sure each task provided a reliable signal during pre-training.
+ +For evaluation diversity, we aimed to assess a broad range of model capabilities, including:
+
We consider a task to provide a reliable signal if its score is dependable. This means the score should be above the random baseline, increase as training progresses, show low variability across different seeds, and produce a consistent model ranking at each training step
To thoroughly examine the signal our tasks provide, we trained many 1.5B parameter models for each language, using 30B tokens from subsets of the supported languages of the five largest openly available multilingual web datasets. These models were trained with the same hyperparameters and tokenizer. We then evaluated them at regular checkpoint intervals on the collected tasks (with no instruction and no system prompt in a 0-shot setting).
+ +This process required multiple evaluation runs for each task due to iterations on its implementation, resulting in a total of 73 000 GPU hours consumed 🔥!
+ +
With 49 models trained, we could finally define what a reliable signal means to us!
+ +
One of our core requirements for a task is that it can be learned from the training data and that this learning can be observed gradually as training progresses. Without improvement over time, it's uncertain whether the score will ever improve in the future.
+ +To measure this, we used the Spearman rank correlation to quantify the correlation between steps and score. Spearman rank correlation can capture monotonicity even when scores don't evolve linearly with the number of steps. We required each task to have at least an average correlation of 0.5 over all model training runs.
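To make this concrete, here is a minimal JavaScript sketch of the monotonicity check, mirroring the simplified `spearmanCorrelation` helper that this page's plotting code (`plot_task.js`, later in this diff) uses; the naive ranking ignores ties:

```js
// Spearman rank correlation between training steps and task scores.
// Naive ranking (ties are not averaged), as in this page's plotting code.
function rankData(values) {
  const sorted = [...values].sort((a, b) => a - b);
  return values.map(v => sorted.indexOf(v) + 1);
}

function spearmanCorrelation(steps, scores) {
  const n = steps.length;
  const rankX = rankData(steps);
  const rankY = rankData(scores);
  let sumDSquared = 0;
  for (let i = 0; i < n; i++) {
    const d = rankX[i] - rankY[i];
    sumDSquared += d * d;
  }
  return 1 - (6 * sumDSquared) / (n * (n * n - 1));
}

// A task passes the monotonicity check if the correlation between
// checkpoint position and score, averaged over all runs, is >= 0.5.
const steps = [5, 10, 15, 20, 25, 30];               // billions of tokens seen
const scores = [0.25, 0.27, 0.31, 0.30, 0.36, 0.38]; // task score per checkpoint
console.log(spearmanCorrelation(steps, scores));     // ~0.94, monotonic enough
```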
+ + +When comparing model performance on tasks, we need to consider whether differences are due to evaluation noise or genuine performance variations.
+ +Noise can arise from the stochastic processes involved in model training, such as random token sampling, data shuffling, or model initialization.
For each task, we computed:
+
We aimed for each task to have an SNR > 20. The only exceptions to this rule are generative tasks, which typically have a relatively low SNR but are still worth including, as they provide insights into how the model behaves when prompted to generate unconstrained text (without answer options). In a multilingual setting, this is particularly relevant, as some models trained on multiple languages can exhibit high task scores but then suddenly reply in the wrong language on generative tasks!
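The SNR statistic itself is simple; this sketch mirrors `computeStatistics` in `plot_task.js` (later in this diff), which divides the mean score of a run by the standard deviation of its checkpoint scores:

```js
// Signal-to-noise ratio of one training run on a task:
// mean checkpoint score divided by the standard deviation of those scores.
const mean = xs => xs.reduce((a, b) => a + b, 0) / xs.length;
const std = xs => {
  const m = mean(xs);
  return Math.sqrt(mean(xs.map(x => (x - m) ** 2)));
};
const snr = scores => mean(scores) / std(scores);

const checkpointScores = [0.40, 0.41, 0.39, 0.42, 0.40];
console.log(snr(checkpointScores) > 20); // true, comfortably above our cutoff
```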
+ +
Many model capabilities are acquired later in training, so many tasks (especially harder ones, such as math-related tasks) show baseline-level performance for an extended period. While these tasks are useful, they're not ideal for early pre-training evaluation, and we did not want to keep them for this setting.
+ +
We first computed the baseline random performance of the task (the sum of 1/n_choices over all samples for multiple-choice questions, and zero for generative evaluations). Then we calculated the task's distance from the baseline as the maximum score across all models minus the baseline.
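A small sketch of this computation; note that we average 1/n_choices over samples here so the baseline is comparable to an accuracy, and the sample counts and scores are illustrative:

```js
// Random baseline of a multiple-choice task: the expected accuracy of
// guessing, i.e. the average of 1/n_choices over all samples (zero for
// generative tasks). Distance from baseline: best model score minus it.
function randomBaseline(choiceCounts) {
  return choiceCounts.reduce((acc, n) => acc + 1 / n, 0) / choiceCounts.length;
}

const choiceCounts = [4, 4, 4, 2];  // three 4-way questions, one 2-way
const maxModelScore = 0.61;         // best score observed across all models
const baseline = randomBaseline(choiceCounts); // 0.3125
const distance = maxModelScore - baseline;     // ~0.30 above random
console.log(baseline, distance);
```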
+ + + +Let's not forget that the main goal of these evaluations is to compare models and datasets!
+ +In the future, we want to use these evaluations to select the best datasets for full model pretraining. This means our tasks should rank datasets trained using very few tokens (we typically run data ablations on 30B tokens), in the same order as they would when trained for longer, after significantly more steps.
+ +In other words, we would like tasks to have predictive capability regarding future performance during pre-training: if pre-training dataset A outperforms pre-training dataset B at 30 billion tokens, we would like this trend to continue at 300 billion tokens.
+ +Proving this is inherently impossible, but there is a necessary preliminary condition that we can test for: for the results to be consistent at large scales, they must also first show consistency at smaller scales!
+ +
To measure this consistency in task ordering, we computed the average Kendall's Tau of the model rankings between every two consecutive steps. We only considered steps after the first 15B tokens of pre-training, as we found the orderings before that point incredibly noisy. A high value of this metric indicates that the ordering remains consistent as training progresses.
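This sketch mirrors the `kendallTauA` helper in `plot_task.js` (later in this diff):

```js
// Kendall's Tau-a between model scores at two consecutive checkpoints:
// (concordant pairs - discordant pairs) / total pairs.
function kendallTauA(x, y) {
  const n = x.length;
  let concordant = 0;
  let discordant = 0;
  for (let i = 0; i < n; i++) {
    for (let j = i + 1; j < n; j++) {
      const s = Math.sign(x[j] - x[i]) * Math.sign(y[j] - y[i]);
      if (s > 0) concordant++;
      else if (s < 0) discordant++;
    }
  }
  return (concordant - discordant) / (n * (n - 1) / 2);
}

// Scores of five models at step t and step t+1:
const stepT = [0.31, 0.42, 0.28, 0.39, 0.35];
const stepT1 = [0.33, 0.45, 0.30, 0.40, 0.37];
console.log(kendallTauA(stepT, stepT1)); // 1, the model ordering is unchanged
```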
+ + + +Now that we covered what we were looking for in our tasks, let's examine two important aspects that can affect the above properties: task formulations and metric choice.
+ + + +The way tasks are presented to the model is crucial, particularly for multiple-choice (MC) tasks. In these scenarios, we must carefully determine how the choices are displayed and what the model is expected to predict.
+ +There are two common approaches: Cloze Formulation (CF) and Multi-Choice Formulation (MCF). In CF, choices are not provided in context, allowing the model to predict each option directly. In contrast, MCF presents the choices in the prompt, using A/B/C/D prefixes, with the targets being those letter prefixes.
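A hypothetical example of the two formulations (the exact prompt templates vary by task):

```js
// Hypothetical CF prompt: the choice text itself is the continuation
// being scored, one candidate at a time.
const cfPrompt = `Question: What is the capital of France?
Answer:`;
const cfTargets = [' Paris', ' London', ' Berlin', ' Madrid'];

// Hypothetical MCF prompt: choices are shown with letter prefixes and
// only the letters are scored.
const mcfPrompt = `Question: What is the capital of France?
A. Paris
B. London
C. Berlin
D. Madrid
Answer:`;
const mcfTargets = [' A', ' B', ' C', ' D'];
```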
+ + + +It's important to know that:
+Therefore, we decided to utilize CF for task selection and MCF for later evaluation of major open source models, as they have generally undergone enough training for these evaluations to have a signal.
+ +As the targets in CF of multiple choice tasks are choices themselves, each target can have a different number of tokens, characters, and unconditional probability (probability of generating the choice without a context prefix).
+ + +To account for this, we consider the following accuracy variations:
+
acc = argmax_i logprob(choice_i): the choice with the highest conditional log-probability wins
acc_char = argmax_i logprob(choice_i) / num_chars(choice_i): the log-probability normalized by the choice's length in characters
acc_token = argmax_i logprob(choice_i) / num_tokens(choice_i): the log-probability normalized by the choice's length in tokens
acc_pmi = argmax_i (logprob(choice_i) - uncond_logprob(choice_i)): the log-probability corrected by the choice's unconditional log-probability
Where logprob(choice_i) is the log-probability of choice i given the task context, and uncond_logprob(choice_i) is its log-probability without that context (e.g., preceded only by "Answer:").
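A sketch of how these variants pick the predicted choice from per-choice log-probabilities; the field names are illustrative rather than lighteval's actual API:

```js
// Picking the predicted choice under each accuracy variant, given each
// candidate's conditional/unconditional log-probabilities and lengths.
function pickChoice(choices, variant) {
  const score = c => ({
    acc: c.logprob,
    acc_char: c.logprob / c.numChars,
    acc_token: c.logprob / c.numTokens,
    acc_pmi: c.logprob - c.uncondLogprob,
  })[variant];
  return choices.reduce((best, c) => (score(c) > score(best) ? c : best));
}

const choices = [
  { text: 'Paris', logprob: -2.1, uncondLogprob: -6.0, numChars: 5, numTokens: 1 },
  { text: 'London', logprob: -4.3, uncondLogprob: -5.5, numChars: 6, numTokens: 2 },
];
console.log(pickChoice(choices, 'acc').text);     // 'Paris'
console.log(pickChoice(choices, 'acc_pmi').text); // 'Paris'
```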
For our generative tasks on the other hand, we used the following metrics:
+prefix_match
: Exact match where only the prefix of the answer must matchf1
: F1 score computed over predicted/gold words extracted using a word tokenizerFor both generative metrics, minor preprocessing is applied to remove articles and punctuation, and lowercase the text.
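A rough sketch of this preprocessing and the word-level F1, assuming simple whitespace splitting; the actual implementation uses language-appropriate word tokenizers:

```js
// Minimal normalization: lowercase, strip punctuation, drop English articles.
function normalize(text) {
  return text
    .toLowerCase()
    .replace(/[^\p{L}\p{N}\s]/gu, ' ')
    .split(/\s+/)
    .filter(w => w && !['a', 'an', 'the'].includes(w));
}

// Word-level F1 between a prediction and a gold answer.
function f1(prediction, gold) {
  const pred = normalize(prediction);
  const ref = normalize(gold);
  const refCounts = new Map();
  ref.forEach(w => refCounts.set(w, (refCounts.get(w) || 0) + 1));
  let overlap = 0;
  for (const w of pred) {
    if ((refCounts.get(w) || 0) > 0) {
      overlap++;
      refCounts.set(w, refCounts.get(w) - 1);
    }
  }
  if (overlap === 0) return 0;
  const precision = overlap / pred.length;
  const recall = overlap / ref.length;
  return (2 * precision * recall) / (precision + recall);
}

console.log(f1('The Eiffel Tower!', 'eiffel tower')); // 1
```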
+ +With our goals and evaluation setup properly defined, we proceeded with task selection!
+
We reviewed tasks one by one, choosing based on the quantified properties. For each language, we aimed to have at least one task for each of the four categories outlined above. Additionally, we wanted at least one generative task for each language.
+ +In cases where multiple versions of a task existed (e.g., MMLU with different translation methods or native versions), we prioritized native versions as long as their metrics were reasonable, followed by human translations of English tasks. If no such version was available, we made our selection entirely based on metrics.
+ +Thus, after removing about half of the tasks, we arrived at 96 final ones, forming "FineTasks."
+ +Use the dropdowns below to navigate the list of tasks and how different metrics affect them.
+ +
All tasks from the selection comply with the criteria outlined in the previous sections, with the only exception being indicqa_tel, which we chose to include to ensure we had at least one generative task for Telugu. Overall, we managed to cover all task categories for each language (the one exception being Thai reasoning, where all available tasks were unfortunately too noisy, with monotonicity too low for us to consider them).
+ +One of the biggest surprises was that some tasks, even when translated using the same method, were reliable in one language but not in others. This was evident with xWinograd, which worked quite well for Russian but did not meet our conditions for French. An even more extreme example was XNLI, which performed well for 6 out of 7 languages, failing to satisfy the reliability properties for Chinese. We had to test four different implementations before finding a reliable version, which, interestingly, was the only one that was created by native speakers and not machine translated.
+ +Feel free to use the dropdowns below to explore the evolution of scores over training for all tested tasks and metrics.
+ + +
Selecting the best evaluation metrics proved to be a challenging task. Not only is there no single metric that consistently outperforms the rest, but we often encountered situations where one metric had better monotonicity while another had a higher signal-to-noise ratio. In such cases, we typically made our decision based on the metric selected for the task's implementation in a different language. We are aware that such hand-picking is often not possible and thus offer the following recommendations:
+ +For generative metrics, the choice is clearer: we suggest using the F1 score unless exact matching is required, as in math-related tasks. F1 is generally less noisy and more resilient to small changes in the generations.
+ + +Since we spent a lot of time and compute on task selection, we were interested in how well major open-source models would do on FineTasks. Given that our evaluation suite primarily targets pretrained models, we focused on these, with a few exceptions for models that don't offer a base (pretrained) version. These exceptions were included mainly out of curiosity, and their results should be interpreted with caution. Such models may significantly outperform other models due to the inclusion of supervised fine-tuning (SFT) data.
+ +To assess the multilingual performance disparity between open-source and closed-source models, we expanded our selection by adding a closed source model: gpt-4o-mini.
+ +As outlined in the task formulations, we are using MCF for this evaluation and employing a 5-shot approach, as recommended by OLMES
In the previous sections, we treated each task independently. However, to determine an overall "multilingual" score of a model, we need to aggregate the results from these tasks. We begin by rescaling the individual task scores in line with the OpenLLM leaderboard
For the final global "multilingual" score we followed a different approach. Instead of averaging the language scores directly, we ranked the model's performance across languages in comparison to other models and then averaged those rank scores. This method ensures that the result reflects the overall model's performance across all languages, preventing an exceptionally high score in one language from skewing the final outcome.
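A sketch of the two-stage aggregation described above; the data shapes are illustrative:

```js
// 1. Rescale each task score so the random baseline maps to 0 and a
//    perfect score to 1, then average the rescaled scores per language.
const rescale = (score, baseline) => (score - baseline) / (1 - baseline);
const languageScore = taskResults =>
  taskResults.reduce((sum, t) => sum + rescale(t.score, t.baseline), 0) /
  taskResults.length;

// 2. Global score: average the model's rank across languages instead of
//    its raw scores, so one outlier language cannot dominate the result.
function globalRankScore(modelName, perLanguageScores) {
  const ranks = Object.values(perLanguageScores).map(entries => {
    const sorted = [...entries].sort((a, b) => b.score - a.score);
    return sorted.findIndex(e => e.model === modelName) + 1;
  });
  return ranks.reduce((a, b) => a + b, 0) / ranks.length;
}

console.log(languageScore([{ score: 0.52, baseline: 0.25 }])); // 0.36
const perLanguage = {
  fr: [{ model: 'A', score: 0.52 }, { model: 'B', score: 0.47 }],
  th: [{ model: 'A', score: 0.33 }, { model: 'B', score: 0.41 }],
};
console.log(globalRankScore('A', perLanguage)); // 1.5, mean rank across languages
```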
+ +After spending even more compute 🔥 on evaluating the selected models, we gathered the results in the following table. Here are our insights:
+ +The Qwen models perform exceptionally well, taking both first and second place with their 72B and 32B versions. Their key strength appears to be in handling high- and mid-resource languages (particularly Chinese), where they consistently ranked first. However, they struggled with lower-resource languages, especially Swahili and Telugu, where their performance lagged.
+ +
The most surprising finding from our evaluation is how models explicitly trained to specialize in a narrow set of languages — like Sarvam-2B-v0.5 for Telugu, or Typhoon-v1.5-8B for Thai — tend to perform exceptionally well on generative tasks, while falling short when it comes to reasoning and general knowledge (GK) tasks, oftentimes getting close to random performance. We hypothesize two explanations: either the models haven't undergone enough training to handle the MCF format, or the broader exposure to various languages (especially English) allows the non-specialized models to perform better on such GK/RES tasks. We note that good generative-task performance nevertheless reveals a good understanding of the target language.
+ +The only exceptions to this rule are typhoon-v1.5-72b and Yi-1.5-34B, both tackling the RES/GK tasks well and managing to rank in the top 4 for their respective languages. We note that typhoon-v1.5-72b is based on Qwen models, and that Yi also included English in its training data.
+ +
Although it didn't take first place, Gemma-2 performed really well in the multilingual domain, especially considering its size. It showed consistent results across all the languages we tested, excelling in low-resource languages like Telugu and Swahili. For anyone working with low-resource languages, we highly recommend Gemma-2 as a strong option.
+ +
As mentioned in the beginning, comparing closed-source models requires extra caution. These models often undergo extensive supervised fine-tuning (SFT), employ highly optimized prompting techniques, and may even generate multiple responses and select the best one. Despite these advantages, gpt-4o-mini ranks only just above the medium-sized 27B Gemma-2. Based on this evidence, we believe that the gap between open-source and closed-source models is very narrow, if not entirely negligible.
+ +
If you would like to evaluate your models on FineTasks and expand the above table, we've made it easy for you. Simply run the following command with your model of choice:
+ +lighteval accelerate\
+ --model_args vllm,pretrained=model_name,pairwise_tokenization=True \
+ --custom_task lighteval.tasks.multilingual.tasks \
+ --tasks 'examples/tasks/finetasks/{cf,mcf}/{ara,fra,rus,tur,swa,hin,tel,tha,zho}' \
+ --max_samples '1000'
+
+ FineTasks is just the beginning of our multilingual journey. As a first step in the creation of the future FineWeb multilingual release, we are using this evaluation setup to curate a high quality pretraining dataset covering a large number of languages. You can expect more news from us soon! We plan to also continue working to make evaluation in non-English domains as seamless as it is in English—and we need your help to achieve that!
+ +LightEval now supports over 550 tasks across various non-English languages, making it the evaluation framework with the best multilingual coverage available. However, there's still much more to do. For many languages, no tasks exist yet, despite our ongoing work. This is where we believe the strong Hugging Face community can make a difference.
+ +
We've made it incredibly easy to contribute new tasks by developing a templating system that supports most of the popular task types while maintaining the authenticity of native language use, right down to correct punctuation. Even if you aren't able to contribute full evaluation tasks, you can still help. Many languages currently lack translations for the anchor words used in evaluation, leaving hundreds of tasks unusable. You can help fill this gap by adding them following our mini guide.
+ +We're looking forward to revisiting this analysis in the future, not with just 9 languages, but at least 50—thanks to community contributions! Let's level the playing field between English and other languages together! 🤗
+For attribution in academic contexts, please cite this work as
+Kydlicek, et al., "FineTasks: Finding signal in a haystack of 200+ multilingual tasks", 2024.+
BibTeX citation
+@misc{kydlicek2024finetasksmultilingualtasks, + title={FineTasks: Finding signal in a haystack of 200+ multilingual tasks}, + author={Hynek Kydlíček and Guilherme Penedo and Clémentine Fourier and Nathan Habib and Thomas Wolf}, + url={https://huggingface.co./spaces/HuggingFaceFW/blogpost-fine-tasks}, +}+
Error loading data: ${error.message}
`; + } + } + + function updateTable() { + const extraColumn = languageDropdown.value === 'All Languages' ? 'All Languages' : + (extraColumnDropdown.value === 'None' ? null : extraColumnDropdown.value); + + tableContainer.innerHTML = ''; + const tableWrapper = createResultsTable(currentData, extraColumn); + tableContainer.appendChild(tableWrapper); + + if (leaderboardDataTable) { + leaderboardDataTable.destroy(); + } + + leaderboardDataTable = new DataTable('.leaderboard-results-table', { + perPage: 10, + perPageSelect: false, + searchable: false, + sortable: true, + fixedHeight: true, + labels: { + info: '' // This removes the "Showing 1 to X of Y entries" text + } + }); + + // Adjust column widths after the table is created + setTimeout(adjustColumnWidths, 0); + } + + const controls = document.createElement('div'); + controls.className = 'controls leaderboard-controls fine-tasks-controls'; + + const languageControlGroup = document.createElement('div'); + languageControlGroup.className = 'control-group'; + languageControlGroup.appendChild(languageLabel); + languageControlGroup.appendChild(languageDropdown); + + const extraColumnControlGroup = document.createElement('div'); + extraColumnControlGroup.className = 'control-group'; + extraColumnControlGroup.appendChild(extraColumnLabel); + extraColumnControlGroup.appendChild(extraColumnDropdown); + + controls.appendChild(languageControlGroup); + controls.appendChild(extraColumnControlGroup); + + container.appendChild(titleElement); + container.appendChild(tableContainer); + container.appendChild(captionElement); // Add caption below the table + container.appendChild(controls); + + // Initialize with All Languages data + languageDropdown.value = 'All Languages'; + updateLanguageTable(); +} + +function adjustColumnWidths() { + const table = document.querySelector('.leaderboard-results-table'); + if (!table) return; + + const columns = table.querySelectorAll('th'); + columns.forEach((column, index) => { + const columnClass = column.className; + const cells = table.querySelectorAll(`td.${columnClass}`); + let maxWidth = column.offsetWidth; + cells.forEach(cell => { + maxWidth = Math.max(maxWidth, cell.offsetWidth); + }); + + let adjustedWidth; + if (index === 0) { // Rank column + adjustedWidth = 50; + } else if (index === 1) { // Model name column + adjustedWidth = 200; + } else if (index === 2) { // Macro score column + adjustedWidth = 100; + } else { // Extra column or any other column + adjustedWidth = Math.min(maxWidth, 150); // Set a maximum width of 150px for other columns + } + + column.style.width = `${adjustedWidth}px`; + cells.forEach(cell => { + cell.style.width = `${adjustedWidth}px`; + }); + }); +} diff --git a/app/src/plot_task.js b/app/src/plot_task.js new file mode 100644 index 0000000000000000000000000000000000000000..28de2636864e3aa411ea45fd3400617a8fdec3b3 --- /dev/null +++ b/app/src/plot_task.js @@ -0,0 +1,619 @@ +import Plotly from 'plotly.js-basic-dist-min'; +import Papa from 'papaparse'; +import _ from 'lodash'; +import { getColor } from './colors.mjs'; + +const languageMap = { + 'Arabic': 'ar', + 'Turkish': 'tr', + 'Swahili': 'sw', + 'Russian': 'ru', + 'Telugu': 'te', + 'Thai': 'th', + 'Chinese': 'zh', + 'French': 'fr', + 'Hindi': 'hi' +}; + +const runNameMap = { + "orion": "Orion", + "helios": "Helios", + "lynx": "Lynx", + "aquila": "Aquila", + "commoncrawl": "CommonCrawl", + "baseline": "Baseline" +}; + +const taskLists = { + ar: ['acva_ara:_average', 'alfgahafa_mlqa_ara_cf', 'alghafa_arc_ara_cf:easy', 
'alghafa_facts_ara_cf', 'alghafa_meta_dialects_ara_cf', 'alghafa_mmlu_ara_cf:_average', 'alghafa_openbookqa_ara_cf', 'alghafa_piqa_ara_cf', 'alghafa_race_ara_cf', 'alghafa_rating_sentiment_ara_cf', 'alghafa_rating_sentiment_no_neutral_ara_cf', 'alghafa_sciqa_ara_cf', 'alghafa_sentiment_ara_cf', 'arcd_ara', 'belebele_arb_Arab_cf', 'boolq_ara', 'exams_ara_cf:_average', 'mkqa_ara:_average', 'mlmm_arc_ara_cf:challenge', 'mlmm_hellaswag_ara_cf', 'mlmm_mmlu_ara_cf:_average', 'mlmm_truthfulqa_ara_cf:mc1', 'mlmm_truthfulqa_ara_cf:mc2', 'mlqa_ara', 'mmlu_ara_cf:_average', 'soqal_ara_cf', 'toxigen_ara_cf', 'tydiqa_ara', 'xcodah_ara_cf', 'xcopa_ara_cf', 'xcsqa_ara_cf', 'xnli2.0_ara_cf', 'xnli_ara_cf', 'xquad_ara', 'xstory_cloze_ara_cf'], + fr: ['belebele_fra_Latn_cf', 'community_boolq_fra_cf', 'exams_fra_cf:_average', 'fquadv2_fra', 'frenchbench_arc_fra_cf:challenge', 'frenchbench_hellaswag_fra_cf', 'meta_mmlu_fra_cf:_average', 'mintaka_fra', 'mkqa_fra:_average', 'mlmm_arc_fra_cf:challenge', 'mlmm_hellaswag_fra_cf', 'mlmm_mmlu_fra_cf:_average', 'mlmm_truthfulqa_fra_cf:mc1', 'mlmm_truthfulqa_fra_cf:mc2', 'pawsx_fra_cf', 'xcodah_fra_cf', 'xcsqa_fra_cf', 'xnli2.0_fra_cf', 'xwinograd_fra_cf'], + hi: ['belebele_hin_Deva_cf', 'community_arc_hin_cf:challenge', 'community_arc_hin_cf:easy', 'community_boolq_hin', 'community_hellaswag_hin_cf', 'indicnxnli_hin_cf', 'indicqa_hin', 'indicxcopa_hin_cf', 'meta_mmlu_hin_cf:_average', 'mintaka_hin', 'mlmm_arc_hin_cf:challenge', 'mlmm_hellaswag_hin_cf', 'mlmm_mmlu_hin_cf:_average', 'mlmm_truthfulqa_hin_cf:mc1', 'mlmm_truthfulqa_hin_cf:mc2', 'mlqa_hin', 'xcodah_hin_cf', 'xcsqa_hin_cf', 'xnli2.0_hin_cf', 'xnli_hin_cf', 'xquad_hin', 'xstory_cloze_hin_cf'], + ru: ['belebele_rus_Cyrl_cf', 'chegeka_rus', 'mathlogic_qa_rus_cf', 'mera_openbookqa_rus_cf', 'mera_worldtree_rus_cf', 'mkqa_rus:_average', 'mlmm_arc_rus_cf:challenge', 'mlmm_hellaswag_rus_cf', 'mlmm_mmlu_rus_cf:_average', 'mlmm_truthfulqa_rus_cf:mc1', 'mlmm_truthfulqa_rus_cf:mc2', 'parus_rus_cf', 'rcb_rus_cf', 'rummlu_rus_cf:_average', 'sber_squad_rus', 'tydiqa_rus', 'xcodah_rus_cf', 'xcsqa_rus_cf', 'xnli2.0_rus_cf', 'xquad_rus', 'xstory_cloze_rus_cf', 'xwinograd_rus_cf'], + sw: ['afric_mmlu_swa_cf:_average', 'afric_xnli_swa_cf', 'belebele_swh_Latn_cf', 'community_arc_swa_cf:challenge', 'community_arc_swa_cf:easy', 'community_mmlu_swa_cf', 'kenswquad_swa', 'm3exams_swa_cf', 'openai_mmlu_swa_cf:_average', 'tydiqa_swa', 'xcodah_swa_cf', 'xcopa_swa_cf', 'xcsqa_swa_cf', 'xnli2.0_swa_cf', 'xnli_swa_cf', 'xstory_cloze_swa_cf'], + te: ['belebele_tel_Telu_cf', 'community_hellaswag_tel_cf', 'indicnxnli_tel_cf', 'indicqa_tel', 'indicxcopa_tel_cf', 'mlmm_arc_tel_cf:challenge', 'mlmm_hellaswag_tel_cf', 'mlmm_mmlu_tel_cf:_average', 'mlmm_truthfulqa_tel_cf:mc1', 'mlmm_truthfulqa_tel_cf:mc2', 'tydiqa_tel', 'xstory_cloze_tel_cf'], + th: ['belebele_tha_Thai_cf', 'community_hellaswag_tha_cf', 'm3exams_tha_cf', 'meta_mmlu_tha_cf:_average', 'mkqa_tha:_average', 'thai_exams_tha_cf:_average', 'thai_exams_tha_cf:tgat', 'thaiqa_tha', 'wsci_tha_cf', 'xcopa_tha_cf', 'xnli2.0_tha_cf', 'xnli_tha_cf', 'xquad_tha'], + tr: ['belebele_tur_Latn_cf', 'community_arc_tur_cf:easy', 'community_hellaswag_tur_cf', 'community_mmlu_tur_cf:_average', 'community_truthfulqa_tur_cf:mc1', 'community_truthfulqa_tur_cf:mc2', 'community_xwinograd_tur_cf', 'exams_tur_cf:_average', 'mkqa_tur:_average', 'tquadv2_tur', 'xcopa_tur_cf', 'xnli2.0_tur_cf', 'xnli_tur_cf', 'xquad_tur'], + zh: ['agieval_zho_cf:_average', 'belebele_zho_Hans_cf', 'c3_zho_cf', 
'ceval_zho_cf:_average', 'chinese_squad_zho', 'cmath_zho_cf', 'cmmlu_zho_cf:_average', 'cmnli_zho_cf', 'cmrc2018_zho', 'm3exams_zho_cf', 'mkqa_zho:_average', 'mlmm_arc_zho_cf:challenge', 'mlmm_hellaswag_zho_cf', 'mlmm_mmlu_zho_cf:_average', 'mlmm_truthfulqa_zho_cf:mc1', 'mlmm_truthfulqa_zho_cf:mc2', 'ocnli_zho_cf', 'pawsx_zho_cf', 'xcodah_zho_cf', 'xcopa_zho_cf', 'xcsqa_zho_cf', 'xnli2.0_zho_cf', 'xnli_zho_cf', 'xquad_zho', 'xstory_cloze_zho_cf', 'xwinograd_zho_cf'] +}; + +const LINE_SETTINGS = { + width: 2.5, + type: "scatter", + mode: "lines+markers", +}; + +const DEFAULT_LAYOUT = { + font: { + family: "apple-system, Arial, sans-serif", + }, + title: { + font: { + size: 15, + }, + }, + xaxis: { + title: { + text: "Training Tokens (billions)", + font: { + size: 14, + }, + }, + tickfont: { + size: 12, + }, + showgrid: false, + mirror: true, + ticks: "outside", + showline: true, + }, + yaxis: { + title: { + font: { + size: 14, + }, + standoff: 10, + }, + showgrid: false, + mirror: true, + ticks: "outside", + showline: true, + tickfont: { + size: 12, + }, + }, + height: 300, // You can adjust this value + autosize: true, + legend: { + orientation: 'h', // Set to 'h' for horizontal legend (required for columns) + yanchor: 'bottom', + y: 0, // Position at the bottom + xanchor: 'right', + x: 1, // Position at the right + traceorder: 'normal', + font: { size: 12 }, + tracegroupgap: 0, // Space between legend items + bgcolor: 'rgba(255, 255, 255, 0.8)' // White background with 70% transparency (1 - 0.3 = 70%) + }, + margin: { + t: 25, + b: 60, + l: 60, + r: 40, + }, +}; + +export function initPlotApplets() { + const plotContainers = document.querySelectorAll('.task-signal-plot'); + plotContainers.forEach(container => { + initPlotApplet(container); + }); +} + +function initPlotApplet(container) { + const defaultLanguage = container.dataset.language || 'Arabic'; + const defaultTask = container.dataset.task || ''; + const defaultMetric = container.dataset.metric || ''; + const groupSeeds = container.dataset.groupSeeds === 'true'; + const showControls = container.dataset.showControls === 'true'; + const taskMetrics = (container.dataset.taskMetrics || 'monotonicity,snr,ordering,randomness').split(","); + + const controls = createControls(container, defaultLanguage, defaultTask, defaultMetric, taskMetrics); + if (!showControls) + controls.style.display = 'none'; + container.appendChild(controls); + + const plotContainer = document.createElement('div'); + plotContainer.className = 'plot-container'; + container.appendChild(plotContainer); + + const statsContainer = document.createElement('div'); + statsContainer.className = 'stats-container'; + container.appendChild(statsContainer); + + + // Create an initial empty plot + Plotly.newPlot(plotContainer, []); + + // Set up the resize function + const resizePlot = () => { + const width = container.offsetWidth; + Plotly.relayout(plotContainer, { width: width }); + }; + + // Add resize listener + window.addEventListener('resize', resizePlot); + + // Initial resize + resizePlot(); + + // Load the initial data + updateLanguageTasks(container, defaultTask, defaultMetric, groupSeeds, taskMetrics); +} + +function createControls(container, defaultLanguage, defaultTask, defaultMetric, taskMetrics) { + const controls = document.createElement('div'); + controls.className = 'controls'; + + const languageSelect = createSelect('language', Object.keys(languageMap), () => updateLanguageTasks(container, '', '', true, taskMetrics)); + languageSelect.value = defaultLanguage; + 
+ const taskSelect = createSelect('task', [], () => updateMetrics(container, '', true, taskMetrics)); + const metricSelect = createSelect('metric', [], () => updatePlot(container, taskMetrics)); + + controls.appendChild(createControlGroup('Language:', languageSelect)); + controls.appendChild(createControlGroup('Task:', taskSelect)); + controls.appendChild(createControlGroup('Metric:', metricSelect)); + + return controls; +} + +function createSelect(id, options, onChangeHandler) { + const select = document.createElement('select'); + select.id = id; + options.forEach(option => { + const optionElement = document.createElement('option'); + optionElement.value = option; + optionElement.textContent = option; + select.appendChild(optionElement); + }); + select.addEventListener('change', onChangeHandler); + return select; +} + +function createControlGroup(labelText, inputElement) { + const group = document.createElement('div'); + group.className = 'control-group'; + + const label = document.createElement('label'); + label.textContent = labelText; + label.className = 'control-label'; + + group.appendChild(label); + group.appendChild(inputElement); + + return group; +} + +async function updateLanguageTasks(container, defaultTask = '', defaultMetric = '', groupSeeds, taskMetrics) { + const languageSelect = container.querySelector('#language'); + const taskSelect = container.querySelector('#task'); + const language = languageSelect.value; + const langCode = languageMap[language]; + + taskSelect.innerHTML = ''; + + try { + const tasks = await getTasksForLanguage(langCode); + + taskSelect.innerHTML = ''; + if (tasks.length > 0) { + tasks.forEach(task => { + const option = document.createElement('option'); + option.value = task; + option.textContent = truncateText(task, 25); // Reduced from 30 to 25 + option.title = task; // Set full task name as title for tooltip + taskSelect.appendChild(option); + }); + + if (defaultTask && tasks.includes(defaultTask)) { + taskSelect.value = defaultTask; + } else { + taskSelect.selectedIndex = 0; + } + + await updateMetrics(container, defaultMetric, groupSeeds, taskMetrics); + } else { + taskSelect.innerHTML = ''; + clearPlot(container); + } + } catch (error) { + console.error('Error fetching tasks:', error); + taskSelect.innerHTML = ''; + clearPlot(container); + } +} + +async function getTasksForLanguage(langCode) { + return taskLists[langCode] || []; +} + +async function updateMetrics(container, defaultMetric = '', groupSeeds, taskMetrics) { + const language = container.querySelector('#language').value; + const task = container.querySelector('#task').value; + const langCode = languageMap[language]; + const metricSelect = container.querySelector('#metric'); + + metricSelect.innerHTML = ''; + + try { + const metrics = await getMetricsForTask(langCode, task); + + metricSelect.innerHTML = ''; + metrics.forEach(metric => { + const option = document.createElement('option'); + option.value = metric; + option.textContent = metric; + metricSelect.appendChild(option); + }); + + if (defaultMetric && metrics.includes(defaultMetric)) { + metricSelect.value = defaultMetric; + } else if (metricSelect.options.length > 0) { + metricSelect.selectedIndex = 0; + } + + await updatePlot(container, taskMetrics); + } catch (error) { + console.error('Error fetching metrics:', error); + metricSelect.innerHTML = ''; + clearPlot(container); + } +} + +async function getMetricsForTask(langCode, task) { + return new Promise((resolve, reject) => { + 
Papa.parse(`data/nanotron_tasks/${langCode}/${task}_stats.csv`, { + download: true, + header: true, + complete: function(results) { + const metrics = [...new Set(results.data.map(row => row.metric))]; + resolve(metrics); + }, + error: function(error) { + console.error('Error fetching metrics:', error); + reject(error); + } + }); + }); +} + +function updatePlot(container, taskMetrics) { + const language = container.querySelector('#language').value; + const task = container.querySelector('#task').value; + const metric = container.querySelector('#metric').value; + const title = container.dataset.title; + const langCode = languageMap[language]; + + if (!langCode || !task || !metric) { + clearPlot(container); + return; + } + + const dataUrl = `data/nanotron_tasks/${langCode}/${task}_data.csv`; + const statsUrl = `data/nanotron_tasks/${langCode}/${task}_stats.csv`; + + Promise.all([ + new Promise((resolve, reject) => { + Papa.parse(dataUrl, { + download: true, + header: true, + dynamicTyping: true, + complete: resolve, + error: reject + }); + }), + new Promise((resolve, reject) => { + Papa.parse(statsUrl, { + download: true, + header: true, + dynamicTyping: true, + complete: resolve, + error: reject + }); + }) + ]).then(([dataResult, statsResult]) => { + const taskData = dataResult.data; + const statsData = statsResult.data; + plotData(container, taskData, statsData, metric, title, taskMetrics); + }).catch(error => { + console.error('Error parsing CSV:', error); + clearPlot(container); + }); +} + +function plotData(container, data, stats, metric, title, taskMetrics) { + const groupSeeds = container.dataset.groupSeeds === 'true'; + const sortedData = sortDataByTokens(data); + const groupedData = groupDataByRunname(sortedData, groupSeeds, metric); + const interpolatedData = interpolateData(groupedData, metric); + const smoothedData = smoothData(interpolatedData, metric); + const traces = createTraces(smoothedData, metric); + + const plotContainer = container.querySelector('.plot-container'); + + const layout = _.merge({}, DEFAULT_LAYOUT, { + title: { text: `${title}` }, + xaxis: { + title: { text: 'Training Tokens (billions)' }, + tickvals: [0, 5, 10, 15, 20, 25], + ticktext: ['0', '5B', '10B', '15B', '20B', '25B'], + tickangle: 45, + range: [0, 30], // Set the range to start from 0 and end at 30B + }, + yaxis: { + title: { text: 'Score' }, + range: [Math.min(...traces.flatMap(trace => trace.y)) * 0.95, Math.max(...traces.flatMap(trace => trace.y)) * 1.05], // Add 5% padding to the top and bottom + }, + width: container.offsetWidth, + }); + + Plotly.newPlot(plotContainer, traces, layout, {responsive: true}); + + // Display statistics + displayStatistics(container, stats, metric, taskMetrics); +} + +function displayStatistics(container, stats, metric, taskMetrics) { + const statsContainer = container.querySelector('.stats-container'); + const metricStats = stats.find(stat => stat.metric === metric); + if (metricStats) { + statsContainer.innerHTML = ` +No statistics available for this metric.
+
+function getReducedTickValues(tokens) {
+    const uniqueTokens = [...new Set(tokens)].sort((a, b) => a - b);
+    const tokenCount = uniqueTokens.length;
+    const targetTickCount = 10; // Adjust this value to increase/decrease the number of ticks
+
+    if (tokenCount <= targetTickCount) {
+        return uniqueTokens;
+    }
+
+    // Keep every stride-th value so at most ~targetTickCount ticks remain
+    const stride = Math.ceil(tokenCount / targetTickCount);
+    return uniqueTokens.filter((_, index) => index % stride === 0);
+}
+
+function formatTickLabel(value) {
+    if (value >= 1e9) {
+        return (value / 1e9).toFixed(1) + 'B';
+    } else if (value >= 1e6) {
+        return (value / 1e6).toFixed(1) + 'M';
+    } else if (value >= 1e3) {
+        return (value / 1e3).toFixed(1) + 'K';
+    }
+    return value.toString();
+}
+
+function computeStatistics(data, metric) {
+    const stats = {
+        avg_spearman: 0,
+        avg_kendall_tau_a: 0,
+        avg_snr: 0,
+        max_n_std: 0
+    };
+
+    const baselineRun = Object.keys(data).find(key => key.toLowerCase().includes('baseline'));
+    const nonBaselineRuns = Object.keys(data).filter(key => key !== baselineRun);
+
+    // Compute statistics for each non-baseline run
+    nonBaselineRuns.forEach(run => {
+        const runData = data[run];
+        const tokens = runData.map(row => row.tokens);
+        const scores = runData.map(row => row[metric]);
+
+        // Monotonicity: Spearman correlation between tokens seen and scores
+        stats.avg_spearman += spearmanCorrelation(tokens, scores);
+
+        // Kendall tau-a over the second half of training, between consecutive
+        // score prefixes. Note: kendallTauA iterates only over the first
+        // x.length elements, so the extra element of the longer slice is
+        // never compared.
+        const lastHalf = Math.floor(runData.length / 2);
+        const kendallTauValues = [];
+        for (let i = lastHalf; i < runData.length - 1; i++) {
+            kendallTauValues.push(kendallTauA(scores.slice(0, i + 1), scores.slice(0, i + 2)));
+        }
+        stats.avg_kendall_tau_a += _.mean(kendallTauValues);
+
+        // SNR and max_n_std (distance above the baseline, in run std-devs)
+        if (baselineRun) {
+            const baselineScores = data[baselineRun].map(row => row[metric]);
+            const stdDev = standardDeviation(scores);
+            stats.avg_snr += _.mean(scores) / stdDev;
+            stats.max_n_std = Math.max(stats.max_n_std, (_.max(scores) - _.mean(baselineScores)) / stdDev);
+        }
+    });
+
+    // Average the statistics across runs
+    const numRuns = nonBaselineRuns.length;
+    stats.avg_spearman /= numRuns;
+    stats.avg_kendall_tau_a /= numRuns;
+    stats.avg_snr /= numRuns;
+
+    return stats;
+}
+
+function spearmanCorrelation(x, y) {
+    // Classic rank-difference formula; assumes no (or few) ties
+    const n = x.length;
+    const rankX = rankData(x);
+    const rankY = rankData(y);
+
+    let sum_d_squared = 0;
+    for (let i = 0; i < n; i++) {
+        const d = rankX[i] - rankY[i];
+        sum_d_squared += d * d;
+    }
+
+    return 1 - (6 * sum_d_squared) / (n * (n * n - 1));
+}
+
+function rankData(data) {
+    const sorted = [...data].sort((a, b) => a - b);
+    // Tied values all receive the lowest rank of the tied group
+    return data.map(x => sorted.indexOf(x) + 1);
+}
+
+function kendallTauA(x, y) {
+    const n = x.length;
+    let concordant = 0;
+    let discordant = 0;
+
+    for (let i = 0; i < n; i++) {
+        for (let j = i + 1; j < n; j++) {
+            const sign_x = Math.sign(x[j] - x[i]);
+            const sign_y = Math.sign(y[j] - y[i]);
+            if (sign_x * sign_y > 0) concordant++;
+            else if (sign_x * sign_y < 0) discordant++;
+        }
+    }
+
+    // Tau-a: ties count as neither concordant nor discordant
+    return (concordant - discordant) / (n * (n - 1) / 2);
+}
+
+function standardDeviation(values) {
+    const mean = _.mean(values);
+    const squareDiffs = values.map(value => {
+        const diff = value - mean;
+        return diff * diff;
+    });
+    const avgSquareDiff = _.mean(squareDiffs);
+    return Math.sqrt(avgSquareDiff);
+}
+
+function interpolateData(data, metric) {
+    // Linearly interpolate every run onto the union of all token checkpoints
+    return _.mapValues(data, (rows) => {
+        const sortedRows = _.sortBy(rows, 'tokens');
+        const allTokens = _.uniq(_.flatMap(Object.values(data), rows => rows.map(r => r.tokens))).sort((a, b) => a - b);
+
+        return allTokens.map(token => {
+            const exactMatch = _.find(sortedRows, { tokens: token });
+            if (exactMatch) return exactMatch;
+
+            const lowerRow = _.findLast(sortedRows, r => r.tokens < token);
+            const upperRow = _.find(sortedRows, r => r.tokens > token);
+
+            // Outside the run's range: clamp to the nearest endpoint
+            if (!lowerRow) return { ...upperRow, tokens: token };
+            if (!upperRow) return { ...lowerRow, tokens: token };
+
+            const ratio = (token - lowerRow.tokens) / (upperRow.tokens - lowerRow.tokens);
+            const interpolatedMetric = lowerRow[metric] + (upperRow[metric] - lowerRow[metric]) * ratio;
+
+            return {
+                ...lowerRow,
+                tokens: token,
+                [metric]: interpolatedMetric
+            };
+        });
+    });
+}
+
+function smoothData(data, metric, windowSize = 3) {
+    // Trailing moving average over the last windowSize checkpoints
+    return _.mapValues(data, (rows) => {
+        return rows.map((row, index, array) => {
+            const window = array.slice(Math.max(0, index - windowSize + 1), index + 1);
+            const smoothedMetric = _.meanBy(window, r => r[metric]);
+            return { ...row, [metric]: smoothedMetric };
+        });
+    });
+}
+
+function sortDataByTokens(data) {
+    return _.sortBy(data, 'tokens');
+}
+
+function groupDataByRunname(data, groupSeeds, metric) {
+    // Remove rows with null or undefined run names
+    data = data.filter(row => row.runname != null && row.runname !== 'null_undefined');
+
+    if (!groupSeeds) {
+        return _.groupBy(data, row => `${processRunName(row.runname)}_${row.seed}`);
+    }
+
+    const grouped = _.groupBy(data, row => processRunName(row.runname));
+
+    // Average the metric across seeds at each token checkpoint
+    return _.mapValues(grouped, (rows) => {
+        const stepGroups = _.groupBy(rows, 'tokens');
+        return _.map(stepGroups, (stepRows) => {
+            const meanMetric = _.meanBy(stepRows, row => parseFloat(row[metric]) || 0);
+            return {
+                ...stepRows[0],
+                [metric]: meanMetric
+            };
+        });
+    });
+}
+
+function processRunName(runname) {
+    for (const [key, value] of Object.entries(runNameMap)) {
+        if (runname.includes(key)) {
+            return value;
+        }
+    }
+    return runname;
+}
+
+function createTraces(groupedData, metric) {
+    const colorsMapping = new Map();
+    // Alphabetical order, with baseline runs sorted last
+    const sortedRunnames = Object.keys(groupedData).sort((a, b) => {
+        if (a.includes('baseline')) return 1;
+        if (b.includes('baseline')) return -1;
+        return a.localeCompare(b);
+    });
+
+    return sortedRunnames.map((runname, index) => {
+        const color = getColorForTrace(runname, colorsMapping, index);
+        return {
+            x: groupedData[runname].map(row => row.tokens),
+            y: groupedData[runname].map(row => row[metric]),
+            name: runname,
+            line: {
+                color: color,
+                shape: 'spline',
+                ...LINE_SETTINGS
+            },
+            marker: {
+                color: color,
+                size: 6,
+            },
+            mode: 'lines+markers',
+        };
+    });
+}
+
+function getColorForTrace(traceName, colorsMapping, index) {
+    const reusedColor = colorsMapping.get(traceName);
+    if (reusedColor) {
+        return reusedColor;
+    }
+
+    const color = getColor(index);
+    colorsMapping.set(traceName, color);
+    return color;
+}
+
+function clearPlot(container) {
+    const plotContainer = container.querySelector('.plot-container');
+    Plotly.purge(plotContainer);
+}
+
+function truncateText(text, maxLength) {
+    if (text.length <= maxLength) return text;
+    return text.slice(0, maxLength - 2) + '..';
+}
diff --git a/app/src/stats.js b/app/src/stats.js
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/app/src/style.css b/app/src/style.css
new file mode 100644
index 0000000000000000000000000000000000000000..fedc2e2579ac61ded10ef5eda16d9fce4e732717
--- /dev/null
+++ b/app/src/style.css
@@ -0,0 +1,366 @@
+/* style.css */
+/* Define colors */
+:root {
+  --distill-gray: rgb(107, 114, 128);
+  --distill-gray-light: rgb(185, 185, 185);
+  --distill-gray-lighter: rgb(228, 228, 228);
+  --distill-gray-lightest: rgb(245, 245, 245);
+  --distill-blue: #007BFF;
+}
+
+d-byline .byline {
+  grid-template-columns: 1fr;
+  grid-column: text;
+  font-size: 0.9rem;
+  line-height: 1.8em;
+}
+
+@media (min-width: 768px) {
+  d-byline .byline {
+    grid-template-columns: 5fr 1fr 1fr;
+  }
+}
+
+d-contents > nav a.active {
+  text-decoration: underline;
+}
+
+@media (max-width: 1199px) {
+  d-contents {
+    display: none;
+    justify-self: start;
+    align-self: start;
+    padding-bottom: 0.5em;
+    margin-bottom: 1em;
+    padding-left: 0.25em;
+    border-bottom: 1px solid rgba(0, 0, 0, 0.1);
+  }
+}
+
+d-contents a:hover {
+  border-bottom: none;
+}
+
+@media (min-width: 1200px) {
+  d-article {
+    /* Ensure d-article does not prevent sticky positioning */
+    overflow: visible;
+  }
+
+  d-contents {
+    align-self: start;
+    grid-column-start: 1 !important;
+    grid-column-end: 4 !important;
+    grid-row: auto / span 6;
+    justify-self: end;
+    margin-top: 0em;
+    padding-right: 3em;
+    padding-left: 2em;
+    position: -webkit-sticky; /* For Safari */
+    position: sticky;
+    top: 10px; /* Adjust this value if needed */
+    z-index: -1;
+  }
+}
+
+d-contents nav h3 {
+  margin-top: 0;
+  margin-bottom: 1em;
+}
+
+d-contents nav div {
+  color: rgba(0, 0, 0, 0.8);
+  font-weight: bold;
+}
+
+d-contents nav a {
+  color: rgba(0, 0, 0, 0.8);
+  border-bottom: none;
+  text-decoration: none;
+}
+
+d-contents li {
+  list-style-type: none;
+}
+
+d-contents ul, d-article d-contents ul {
+  padding-left: 1em;
+}
+
+d-contents nav ul li {
+  margin-bottom: .25em;
+}
+
+d-contents nav a:hover {
+  text-decoration: underline solid rgba(0, 0, 0, 0.6);
+}
+
+d-contents nav ul {
+  margin-top: 0;
+  margin-bottom: 6px;
+}
+
+d-contents nav > div {
+  display: block;
+  outline: none;
+  margin-bottom: 0.5em;
+}
+
+d-contents nav > div > a {
+  font-size: 13px;
+  font-weight: 600;
+}
+
+d-article aside {
+  height: 0px;
+  overflow: visible;
+  margin-bottom: 1em;
+  z-index: 1000;
+}
+
+@media (min-width: 768px) {
+  d-article aside {
+    margin-bottom: 0;
+  }
+}
+
+d-contents nav > div > a:hover,
+d-contents nav > ul > li > a:hover {
+  text-decoration: none;
+}
+
+/* Controls for the plotting applet */
+.controls {
+  display: flex;
+  flex-wrap: nowrap;
+  gap: 10px;
+  justify-content: center;
+}
+
+.control-group {
+  display: flex;
+  flex-direction: column;
+  align-items: center;
+}
+
+.controls select {
+  padding: 2px 4px;
+  line-height: 1.5em;
+  text-align: center;
+  border-radius: 4px;
+  font-size: 0.7em;
+  background-color: var(--distill-gray-lightest);
+  outline: none;
+}
+
+.controls label {
+  font-size: 0.8em;
+  font-weight: bold;
+}
+
+/* Specific style for the task dropdown */
+#task {
+  max-width: 180px;
+}
+
+.controls select option {
+  max-width: 300px;
+  overflow: visible;
+}
+
+.task-signal-plot {
+  width: 100%;
+  max-width: 500px; /* Adjust this value as needed */
+  margin: 0 auto;
+}
+
+.stats-container {
+  margin-bottom: 5px;
+}
+
+.compact-stats {
+  display: grid;
+  place-items: center;
+  grid-template-columns: 1fr 1fr;
+  gap: 5px;
+  font-weight: bold;
+  font-size: 12px;
+}
+
+.compact-stats-single {
+  display: grid;
+  place-items: center;
+  font-weight: bold;
+  font-size: 12px;
+}
+
+.fine-tasks-controls,
+.leaderboard-controls {
+  margin-bottom: 20px;
+}
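+/* Note: createControlGroup() in plots.js assigns the class "control-label" to
+   each label, but only ".controls label" is styled above. If a dedicated rule
+   is wanted, a sketch like the following (an assumption, not required for the
+   current markup, where the labels already sit inside .controls) would target
+   it directly: */
+/*
+.control-label {
+  font-size: 0.8em;
+  font-weight: bold;
+}
+*/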
+
+.fine-tasks-table-wrapper {
+  margin-top: 20px;
+}
+
+.fine-tasks-results-table {
+  width: 100%;
+  border-collapse: separate;
+  border-spacing: 0;
+  table-layout: fixed; /* This ensures that the table respects column widths */
+}
+
+.fine-tasks-results-table th,
+.fine-tasks-results-table td {
+  border: 1px solid #ddd;
+  padding: 8px;
+  text-align: left;
+  overflow: hidden;
+  text-overflow: ellipsis;
+  white-space: nowrap;
+}
+
+.fine-tasks-results-table th {
+  background-color: #f9f9f9;
+  font-weight: bold;
+}
+
+.fine-tasks-results-table tr:nth-child(even) {
+  background-color: #f2f2f2;
+}
+
+.fine-tasks-results-table tr:nth-child(odd) {
+  background-color: #fff;
+}
+
+.fine-tasks-results-table tr:hover {
+  background-color: #ddd;
+}
+
+/* Remove the horizontal line above the pagination */
+.fine-tasks-table-wrapper .datatable-bottom {
+  border-top: none;
+}
+
+/* Hide the "Showing X to Y of Z entries" text */
+.fine-tasks-table-wrapper .datatable-info {
+  display: none;
+}
+
+.fine-tasks-title {
+  text-align: center;
+  margin-top: 20px;
+  margin-bottom: 20px;
+}
+
+.fine-tasks-results-table td[title] {
+  cursor: help;
+}
+
+.leaderboard-title {
+  text-align: center;
+  margin-top: 20px;
+  margin-bottom: 20px;
+}
+
+.leaderboard-results-table {
+  width: 100%;
+  border-collapse: separate;
+  border-spacing: 0;
+  table-layout: fixed;
+}
+
+.leaderboard-results-table th,
+.leaderboard-results-table td {
+  border: 1px solid #ddd;
+  padding: 8px;
+  text-align: left;
+  overflow: hidden;
+  text-overflow: ellipsis;
+  white-space: nowrap;
+}
+
+.leaderboard-results-table th:first-child,
+.leaderboard-results-table td:first-child {
+  width: 50px; /* For the rank column */
+}
+
+.leaderboard-results-table th:nth-child(2),
+.leaderboard-results-table td:nth-child(2) {
+  width: 200px; /* For the model name column */
+}
+
+.leaderboard-results-table th:nth-child(3),
+.leaderboard-results-table td:nth-child(3) {
+  width: 100px; /* For the macro score column */
+}
+
+.leaderboard-results-table th:nth-child(4),
+.leaderboard-results-table td:nth-child(4) {
+  width: 150px; /* For the extra column */
+}
+
+.leaderboard-results-table th {
+  background-color: #f9f9f9;
+  font-weight: bold;
+}
+
+.leaderboard-results-table tr:nth-child(even) {
+  background-color: #f2f2f2;
+}
+
+.leaderboard-results-table tr:hover {
+  background-color: #ddd;
+}
+
+/* Remove the horizontal line above the pagination */
+.leaderboard-table-wrapper .datatable-bottom {
+  border-top: none;
+}
+
+/* Hide the "Showing X to Y of Z entries" text */
+.leaderboard-table-wrapper .datatable-info {
+  display: none;
+}
+
+.leaderboard-results-table td[title] {
+  cursor: help;
+}
+
+/* Tooltip styles */
+.leaderboard-results-table td[title]:hover::after {
+  content: attr(title);
+  position: absolute;
+  left: 0;
+  top: 100%;
+  background-color: #f9f9f9;
+  color: #000;
+  padding: 5px;
+  border: 1px solid #ddd;
+  border-radius: 4px;
+  z-index: 1000;
+  white-space: nowrap;
+  overflow: hidden;
+  text-overflow: ellipsis;
+  max-width: 300px;
+}
+
+.table-caption {
+  text-align: center;
+  margin-bottom: 10px;
+  font-style: italic;
+  color: #666;
+  width: 100%; /* Ensure the figcaption takes full width */
+  display: block; /* Make it a block element */
+}
diff --git a/app/webpack.config.js b/app/webpack.config.js
new file mode 100644
index 0000000000000000000000000000000000000000..23ed9fdcfa7adeda49c2ff90d77fe909929b4ed6
--- /dev/null
+++ b/app/webpack.config.js
@@ -0,0 +1,106 @@
+const path = require("path");
require("clean-webpack-plugin"); +const CopyPlugin = require("copy-webpack-plugin"); +const BundleAnalyzerPlugin = require("webpack-bundle-analyzer").BundleAnalyzerPlugin; + +const COLOR_KEYS = ["color", "bgColor", "fillcolor"]; + +const transformDataColors = async (data, path) => { + const {getNamedColor} = await import('./src/colors.mjs'); + // if not json file, return + if (!path.endsWith(".json")) { + return data; + } + const parsedData = JSON.parse(data); + // Change the color of the data + const deepIterateAndSetColor = (key, val) => { + if (val === null) { + return null; + } + if (val == undefined) { + return undefined; + } + if (Array.isArray(val)) { + return val.map(item => deepIterateAndSetColor(key, item)); + } + if (typeof val === "object") { + return Object.entries(val).reduce((newObj, [key, value]) => { + newObj[key] = deepIterateAndSetColor(key, value); + return newObj; + }, {}); + } + if (COLOR_KEYS.includes(key)) { + const [colorName, opacity, ...rest] = val.trim().split(/\s+/); + const floatOpacity = parseFloat(opacity); + const newColor = getNamedColor(colorName, floatOpacity); + if (newColor !== undefined && rest.length === 0 && !isNaN(floatOpacity)) { + console.log(`key: ${key} in file ${path} changed from ${val} to ${newColor}`); + return newColor; + } else { + return val; + } + } + return val; + }; + return JSON.stringify(deepIterateAndSetColor(undefined, parsedData)) +}; + +module.exports = { + entry: { + distill: "./src/distill.js", + main: "./src/index.js", + }, + output: { + filename: "[name].bundle.js", + path: path.resolve(__dirname, "dist"), + }, + module: { + rules: [ + { + test: /\.(js|mjs)$/, + exclude: /node_modules/, + use: { + loader: "babel-loader", + options: { + presets: ["@babel/preset-env"], + }, + }, + }, + { + test: /\.css$/, + use: ['style-loader', 'css-loader'], + }, + ], + }, + plugins: [ + new CleanWebpackPlugin(), + new CopyPlugin({ + patterns: [ + { + from: "assets", + to: "assets", + }, + { from: "src/style.css", to: "style.css" }, + { from: "src/bibliography.bib", to: "bibliography.bib" }, + { from: "src/index.html", to: "index.html" }, + { + from: "../analysis/data", + to: "data", + globOptions: { + ignore: ["**/*.json"], + }, + }, + ], + }), + ], + devtool: process.env.NODE_ENV === 'production' ? 'source-map' : 'eval-source-map', + devServer: { + static: "./dist", + open: process.env.NODE_ENV !== 'production', + hot: process.env.NODE_ENV !== 'production', + liveReload: process.env.NODE_ENV !== 'production', + }, + mode: process.env.NODE_ENV === 'production' ? 'production' : 'development', +}; + +console.log(process.env.NODE_ENV)