{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "1tp4Ct477dce" }, "source": [ "# inference_bert_synthsts.ipynb\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "jyutKgckKjd9", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "c9189e2d-d005-4904-ebd0-160f0d2c7824" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/156.5 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m156.5/156.5 kB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/280.0 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m280.0/280.0 kB\u001b[0m \u001b[31m18.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h" ] } ], "source": [ "!pip install sentence-transformers transformers accelerate -q" ] }, { "cell_type": "code", "source": [ "model_name = \"BEE-spoke-data/bert-plus-L8-v1.0-syntheticSTS-4k\" # @param {type:\"string\"}\n", "#@markdown > try replacing the model with `sentence-transformers/all-mpnet-base-v2` and re-run to see the difference\n", "model_name" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 37 }, "cellView": "form", "id": "pM7zazZ-1_Ip", "outputId": "adaf0536-a28b-4fbb-ffa1-38ccb6da08aa" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "'BEE-spoke-data/bert-plus-L8-v1.0-syntheticSTS-4k'" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 2 } ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 418, "referenced_widgets": [ "d8cf459ffac147698f81bf68ec561a6e", "8cb55a0a6dcb4018a278b5d46e5526e7", "4c31b7c0ac1c47558a8b23bf0f1540e3", "03527f38cc41473680573988eb83a13b", "ed231ede5ca740ec945d7708c02eb1a8", "df218946f7a6478090da66882ea2889b", "046ac459f31f40baa0233688edfe6f04", "b7755b1f1b2b4bfdac787bfbc25a29d4", "cbae7c7bd69840d5a76a4158e41cdf61", "ee9e8221f09841aea255d3989660c607", "ed6f56dabaa5455281c0f87d4b0bbf3f", "9ebeabca35a34804be12f79704ee19a5", "a6caea795c5a4057a9fdc024c67913cd", "2925cefed9684372a0ec9b80d60257e6", "368e69ce12f643b6922c592e3df44372", "7067f08d0d8a471581356cc1076c5aea", "bc79670b8b6143e58b05328d9ce6da75", "de8bbe9f6f92409587dc4f79eaf837d6", "f88b889456ed4464a2118f0f6be0c76d", "e39db22546db44f89f6e8af1813a773a", "a060d88289584dd4b0835be34fda7770", "036430685e184adc9f33b3a64de9658f", "954f812c628a4a8c942ab0f06a7018d6", "864e66d3416c44c384d3141858c97f92", "21e19a10a1c1491990abaa0ff184f92f", "1a7fbc5246014717a5faba6a96e57483", "bea7124776fb4ab9aa5e9d43094c892f", "3341d2849b3a4a408503df8590fc9ccb", "a0a43631ce964f26b669ce3c9523a100", "c4e7592ca3a34ca7a9c179e4d4812018", "b77aa6c7fe0342719a5ff8e0fe8107c2", "761952ec1c9d4e9f88f8113c701db5bf", "6a99262a215c46229c3eab0494c2d00b", "aad4559b64454d49afd730aa73aa5851", "b053328102d04320a1f7d1fa094c0274", "9622430adfb148f0abc9fbeb169acc1a", "9b797d261b454191bcf4bbad863054e0", "3bcc1d754615494bb912249dde19bf73", "15edcd00cd3c45a1bf7814eafbd9b293", "dc4bd9c8f0104b4cabccded26ac90437", "ca5215c104b947c6a68e36b98facb157", "56e65545bfea4b65a5a03fe5a1213aec", "9f36c32befa84ba7ad75b24db006981d", "f131ad2f76f44aee91cfa102303cb47f", "2e584c661dbd4a0e9bd65ab5ebf2e47d", "3e1f53185dbc44c4ad902bfb94ba589c", "2b3c019b5cb444df9e11a67f6a312751", "0b89575f105f4a9aaa3170b86627438c", "fc7e1104a4c14adfbbc60f94ecdebe0c", "551b48c5035d4367bebcee676b53ef83", "6a177f33cd8d4145ac805c7f0257b05d", "d0c2f5d22cab4f29b8e4da2c805c4ce0", "80abcca27e8d4b2ba938c7629af86f68", "2113c5948aba450395c6b63715ea05d0", "772124fd896a49649e32bde0e8bddf11", "2bea672e24c44315ab0e064eab9bd922", "29a334d113f044f8840314796d21c2a6", "838cd703b1c04fd7888ac9dffeea8d0a", "c2b32b6fa58b465e869add0942c96d77", "92774eb6f7ed4bdd830e1391ba8939c6", "057460e860934735a04a055e86f235e6", "eabd151800b046f49577bd4ea631236c", "5ae7706ddd7b461d95abfbc7451be8a0", "e20fa5adca314e028382cb7795386f33", "9ecbfde490dc4e2ebefbf843a0f3b5ca", "37a0324928f848f496554d6a5e7104f6", "2800b55f336e49478e5e93244eca8cce", "b47e722639fa43bd9e13c3d2807993cf", "d14e4cdf2097427da280608a45daf88a", "042a7e7d043541df80c67bcec15446f4", "b28bcac1839f40c9bbad35d9fb200097", "58c4e346e525474fbba71c70f3bd6cad", "cee8ffcd16b2493cb8458986840da219", "a65d21f9625f46af83933ec2042ab3e5", "434fe781f92d4689a154b3a83805d7b6", "327a6db9ae884dacbaf4e8f01ce17bf2", "72d486fac0fe40ebab486abb4a04eb73", "7d3d9593905548db95a3a9502c73b975", "13280ebd271346729a9c008d1b1c99f3", "a0e6c24d816645a7938a328faa5f200b", "f9bd9fada56d4dceb8c5b81f6301ed54", "a47812a419fd4159954f88f3c8d86a3f", "a7c8cf6e63d24e919a43db38f2c929d6", "5cabaca6cdb741c48e07176284c51442", "773418ae872043c79cf6f02be18e1cd9", "5e13162a31e64bb9bc9423592b912927", "2a77f7d25e104e7a85c64a4d096fa44d", "02d7e0533c974e6bb7cafea9f5c1acb7", "731637a52f684a52b8a59aab4371d220", "7cb3612a8ae146d7947ef1c5c6f49a24", "b49cbd4a6fc3456a9eb6f9fe3d6f5750", "f0733f96389f461ca716d1a6fa377d8c", "f8eabf3c832e43249f17ef956c15fa46", "93af09dae02c42ef889a6d559d222e36", "dc0f6fb12ba34022895aba59068e3dcd", "4151fe27486b4d9d8701330779354212", "dc2a97f7ae8a4ae58edd9a2867f08d15", "294a9dc6efb5433f873d51bcdb1c1d51", "ce69221e780a4c30be9b9a2d751403e5", "71d6f794c0c94c20851c46ab23664329", "bc1ab537bdbe43d5bd046a9a7306abc3", "f5fc1b5e9f8c499eb5d24a7b08f5f08a", "31fdef5bfca44784b6431b840ff240c7", "aca89765c8694c499b592b525298f26f", "16eca963af3542bda1f00922f0b52864", "d9bf95b46ed74f7aa7a75100e9bc9b1b", "94d80a80d1854547823ea8c6eedcdec9", "0f57dc066c4844139c48dee8a8397b02", "5f9c24c933344841925212b3de823da2", "c866605ea7134187af73dd8a03fa2a70", "ee33bd5044514354be75cd64a34114bc", "1a3af4da3e0741eb84e6ee2e5e2bb518", "908cc2f389be46549236edbfc67e7bf1", "16f7b78b16aa4640996b52deed2ddf51", "ca03333b91274b5ea8e181a171975c2f", "17e0a3a3b6504300bbe69d50b7ec9e22", "3856b9570aef4cfda22dc673dc89176b", "098676604a4440718e35f2d9b1cc399d", "70d5f2f02c774feba6827f88ba2c05e2", "57b884b47d3a41bc9f4ec91198d3aba1", "dbd59e1c6a36464ca10bdb3ac2c53a25", "a42b4271349f41afab56f0264ffd5d73", "71075de912604b6686963e10dcf44cdb", "d884981f91534a42bf1485403b8a4dee", "8bf7e729882041829a19b32593fa7479", "0cdc0bc1063a4c559c61a09567d7c6f3", "bbf855628fb44d628d843a010e633e6c", "0e141689906c4225a9b878f04a2bbcdf", "37b438c0d59c41959b5068e161253b06", "aa022c70601a4725b80e0317d84aa36b", "db7a2e7851454d4d8fe3f6d7b743ff51", "ae24a75aafdd4646a146b51809ef54dc" ] }, "id": "TrHFFQjxwf_B", "outputId": "456531d6-c0b9-4c9f-d2a2-ae3bbfd7a01d" }, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "modules.json: 0%| | 0.00/229 [00:00\n", "RangeIndex: 19 entries, 0 to 18\n", "Data columns (total 6 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 source_doc_filename 19 non-null string\n", " 1 source_doc_id 19 non-null string\n", " 2 source_doc_domain 19 non-null string\n", " 3 document_text 19 non-null string\n", " 4 summary 19 non-null string\n", " 5 summary_source 19 non-null string\n", "dtypes: string(6)\n", "memory usage: 1.0 KB\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ " source_doc_filename source_doc_id \\\n", "8 OCR_ML4HLecture02image_.txt 67f6cc9a-83c \n", "1 ASRnlp_law_lecture_week_2_v_2_c_transcription_... 016e8d29-288 \n", "2 ASRnlp_law_lecture_week_3_part_1_v_2_c_transcr... 07af2cf9-15a \n", "16 script_strangersonatrain.txt 9e6bfae4-7c2 \n", "14 script_findingnemo.txt 04a90337-527 \n", "13 OCR_PAPER_Kandpal, Nieto, Jin - 2022 - Music E... 110b05be-f8d \n", "3 ASR-whisper-rpunctuated_Noam Chomsky, Fundam_1... fed834b5-a04 \n", "\n", " source_doc_domain document_text \\\n", "8 OCR \n", "\n", "Ezurich Lecture Machine Learning for Healthc... \n", "1 ASR I Just want to recap quickly what is already a... \n", "2 ASR I think we do reading as started. Thanks every... \n", "16 Script STRANGERS ON A TRAIN\n", "\n", " ... \n", "14 Script ----------------------------------------------... \n", "13 OCR_academic_paper \n", "\n", "MUSIC ENHANCEMENT VIA IMAGE TRANSLATION AND ... \n", "3 ASR_cleaned Well, I'd like to, in these two talks, I'd lik... \n", "\n", " summary summary_source \n", "8 The Ezurich Lecture on Machine Learning for He... gpt-4-0125-preview \n", "1 The instructor begins the class by offering a ... gpt-4-0125-preview \n", "2 The speaker addresses the adaptation to hybrid... gpt-4-0125-preview \n", "16 \"Strangers on a Train\" is a suspenseful story ... gpt-4-0125-preview \n", "14 The transcript details the plot of the 2003 an... gpt-4-0125-preview \n", "13 Nikhil Kandpal, Oriol Nieto, Zeyu Jin, and the... gpt-4-0125-preview \n", "3 The speaker explores the foundational aspects ... gpt-4-0125-preview " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
source_doc_filenamesource_doc_idsource_doc_domaindocument_textsummarysummary_source
8OCR_ML4HLecture02image_.txt67f6cc9a-83cOCREzurich Lecture Machine Learning for Healthc...The Ezurich Lecture on Machine Learning for He...gpt-4-0125-preview
1ASRnlp_law_lecture_week_2_v_2_c_transcription_...016e8d29-288ASRI Just want to recap quickly what is already a...The instructor begins the class by offering a ...gpt-4-0125-preview
2ASRnlp_law_lecture_week_3_part_1_v_2_c_transcr...07af2cf9-15aASRI think we do reading as started. Thanks every...The speaker addresses the adaptation to hybrid...gpt-4-0125-preview
16script_strangersonatrain.txt9e6bfae4-7c2ScriptSTRANGERS ON A TRAIN\n", "\n", " ...\"Strangers on a Train\" is a suspenseful story ...gpt-4-0125-preview
14script_findingnemo.txt04a90337-527Script----------------------------------------------...The transcript details the plot of the 2003 an...gpt-4-0125-preview
13OCR_PAPER_Kandpal, Nieto, Jin - 2022 - Music E...110b05be-f8dOCR_academic_paperMUSIC ENHANCEMENT VIA IMAGE TRANSLATION AND ...Nikhil Kandpal, Oriol Nieto, Zeyu Jin, and the...gpt-4-0125-preview
3ASR-whisper-rpunctuated_Noam Chomsky, Fundam_1...fed834b5-a04ASR_cleanedWell, I'd like to, in these two talks, I'd lik...The speaker explores the foundational aspects ...gpt-4-0125-preview
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "summary": "{\n \"name\": \"df\",\n \"rows\": 7,\n \"fields\": [\n {\n \"column\": \"source_doc_filename\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 7,\n \"samples\": [\n \"OCR_ML4HLecture02image_.txt\",\n \"ASRnlp_law_lecture_week_2_v_2_c_transcription_2.txt\",\n \"OCR_PAPER_Kandpal, Nieto, Jin - 2022 - Music Enhancement via Image Translation and Vocoding-annotated_.txt\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"source_doc_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 7,\n \"samples\": [\n \"67f6cc9a-83c\",\n \"016e8d29-288\",\n \"110b05be-f8d\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"source_doc_domain\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"ASR\",\n \"ASR_cleaned\",\n \"Script\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"document_text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 7,\n \"samples\": [\n \"\\n\\nEzurich Lecture Machine Learning for Healthcare 99 (261-5120-00L) Basics of ML for Medical Image Analysis Julia Vogt & Valentina Boeva & Gunnar Ratsch Institute for Machine Learning, Computer Science Department @gxr @gxrlab #DataScience #PrecisionMedicine #ClinicalData Li BPORRICHEs DINFK D BIOL UniversityHospital Zurich Gunnar Ratsch 1. 3. 2022\\n\\nElzurich Topics for Today Medical Image Data Typical medical image analysis problems Segmentation Superpixels Markov Random Fields Image Classification Convolutional Neural Networks Application in Digital Pathology WAi BPORRIHc INFORMATICS\\n\\nEzurich Analysis of Medical Images Pathology (2d, high resolution) Radiology (2d, 3d, low res:) Retina Fundus 2d high resolution Ultrasonic (low resolution, temporal) MRI CT bi %\\n\\nEzurich sZoo\\\" of Image Analysis/Labeling Problems Geometry Estimation Image Denoising Object Segmentation Depth Estimation Sky Building Tree Grass bi IVrUnIVIAC) Gunnar Ratsch 1. 3. 2022\\n\\nEzurich Image Analysis Problems Non-complete list of (medical) image analysis problems Image classification (\\\"normal vs. diseased eye fundus Image registration 'register multiple images of same patient\\\") Image labeling (\\\"find cancer cells' 3d object reconstruction (heart model\\\") Image segmentation (\\\"identify vasculature\\\" = more next page) Image analysis is a very broad field with many challenges. It would need its own lecture on that topic Actually multiple lectures. We can only cover some aspects and only some of the basics. WAi BPORRIHc INFORMATICS Gunnar Ratsch 1. 3. 2022 5\\n\\nEzurich Segmentation in Medical Imaging Determination of the volumes of abdominal solid organs and focal lesions has great potential importance: Monitoring the response to therapy and the progression of tumors and preoperative examination of living liver donors are the most common clinical applications of volume determination. MRI volumetry of the hippocampus can help distinguish patients with Alzheimer's disease from elderly controls with a high degree of accuracy (80%-90%). In order to be able to detect and quantify vascular diseases one of the first step is the segmentation of the vasculature: WAi BPORRIHc INFORMATICS Gunnar Ratsch 1. 3. 2022\\n\\nEzurich Segmentation Segmentation of an image entails the division or separation of the image into regions of similar attribute. Categorization of different segmentation methods: Boundary-based: optimum boundary, active boundary; live wire, level sets Shape Model-based: Manual tracing, live wire, active shapelappearance, M-reps, atlas-based Region-based: clustering; kNN, CM, FCM, fuzzy connectedness, MRE; graph cut_ watershed, optimum partitioning WAi BPORRIHc INFORMATICS Gunnar Ratsch 1. 3. 2022\\n\\nEzurich Superpixel Algorithms in computer vision use the pixel-grid as the underlying representation: The pixel-grid is not a natural representation of visual scenes, it is rather just an \\\"artifact\\\" of a digital imaging process: It would be more natural to work with perceptually meaningful entities obtained from a low-level grouping process. Superpixels are essentially the visually homogeneous regions of an image, that were acquired by partitioning the image into N regions, where the pixels within a region share some loW-level property (color; texture etc:) WAi BPORRIHc INFORMATICS Gunnar Ratsch 1. 3. 2022\\n\\nEzurich Superpixels Superpixels images of different superpixels number Ks and different distance functions (a) K = 50 (Euclidean distance), (b) K = 100 (Euclidean distance), (c) K = 200 (Euclidean distance); (d) K = 200 (Mahalanobis distance): (d) Source: L Zhang et. al. An improved method for pancreas segmentation using SLIC and interactive region merging Gunnar Ratsch 1. 3. 2022 WAi BPORRIHc INFORMATICS\\n\\nEzurich Superpixel properties It is computationally efficient: it reduces the complexity of images from hundreds of thousands (millions) of pixels to only a few hundred (thousand) superpixels. It is also representationally efficient: pairwise constraints between units, while only for adjacent pixels on the pixel-grid, can now model much longer-range interactions between superpixels. The superpixels are perceptually meaningful: each superpixel is a consistent unit; i. e. all pixels in a superpixel are most likely uniform in, color or texture. It is near-complete: since superpixels are results of an over-segmentation, most structures in the image are conserved. There is very little loss in moving from the pixel-grid to the superpixel map. WAi BPORRIHc INFORMATICS Gunnar Ratsch 1. 3. 2022 10\\n\\nEzurich Simple Linear Iterative Clustering [SLIC] Outline SLIC is a simple and efficient method to partition an image in visually homogeneous regions. It is based on a spatially localized version of k-means clustering: Each pixel is associated to a feature vector: (x, y) [Ax, Ay; I(x, y)] where I(x, y) is the pixel value(s) of the image at the given location, A coefficient balances the spatial and appearance components of the feature vectors, imposing a degree of spatial regularization to the extracted regions. Using these feature vectors k-means clustering is applied and pixels assigned to the same cluster will form a superpixel. WAi BPORRIHc INFORMATICS Gunnar Ratsch 1. 3. 2022 11\\n\\nEzurich Simple Linear Iterative Clustering [SLIC] Algorithm Input parameters Region Size (RS): the nominal size of the regions (superpixels) Regularizer (R): the strength of the spatial regularization The image is first divided into a grid with step RS. The center of each grid tile is then used to initialize a corresponding k-means: The acquired k-means centers and clusters are refined by using the k-means Lloyd algorithm. The parameter regularizer sets the trade-off between clustering appearance and spatial regularization, which is obtained by setting RS R in the definition of the feature 4(x, y). After the k-means step, SLIC optionally removes any segment whose area is smaller than a given threshold by merging them into larger ones. WAi BPORRIHc INFORMATICS Gunnar Ratsch 1. 3. 2022 12\\n\\nEzurich Resulting Superpixels Superpixels images of different superpixels number Ks and different distance functions (a) K = 50 (Euclidean distance), (b) K = 100 (Euclidean distance), (c) K = 200 (Euclidean distance); (d) K = 200 (Mahalanobis distance): (d) Source: L Zhang et. al. An improved method for pancreas segmentation using SLIC and interactive region merging Gunnar Ratsch 1. 3. 2022 13 WAi BPORRIHc INFORMATICS\\n\\nEzurich Image Segmentation \\\"7 Labelling Pixels Labellings highly structured Labels highly correlated with very complex dependencies Independent label estimation too hard It is desired that the whole labelling should be formulated as one optimisation problem: High resolution images: Hard to train complex dependencies Optimisation problem is hard to infer WAi BPORRIHc INFORMATICS Gunnar Ratsch 1. 3. 2022 14\\n\\nEzurich Segmentation as an Energy Minimization Problem Edata assigns non-negative penalties to a pixel location i when assigning a label to this location. Esmooth assigns non-negative penalties by comparing the assigned labels at adjacent positions i and j This optimization model is characterized by local interactions along edges between adjacent pixels, and often called MRF (Markov Random Field) model: WAi BPORRIHc INFORMATICS Gunnar Ratsch 1. 3. 2022 15\\n\\nEzurich Markov Random Field MRF is a graphical model over an undirected graph (G-(V, E)) positivity property (P(x) 0) and Markov property: Set of random variables linked to nodes: {x; EUR V} Set of neighbored random variable: N(x;) = {x; lj eN} Markov property: P(X; Xv-{}) = P(X; 1 XNv Pairwise MRFs: P(x) zexp(-E(x)) 1 E(x) Liev ~;(w;) + LievjeV; Vij(vi, Tj) WAi BPORRIHc INFORMATICS Gunnar Ratsch 1. 3. 2022 16\\n\\nEzurich Example: Foreground Background Estimation =0 _ iis in background (to be determined) X = 1 Siis in foreground (to be determined) Data term (i=1, _, n): 7;(0) log P(xi e BG) Probabilities are estimated using FG / BG Pi(1) log P(xi EUR FG) colour models (from pretrained model) Smoothness term (i, j-1,., n): Vij(vi, Ti) KijS(ci # 1;) Kij A1 + Az exp( B(I; 1j)2) Intensity dependent smoothness Looking forx * EUR {0, 1} n that minimizes E(x), with fixed Li BPORRISHEs backgroundlforeground labels Gunnar Ratsch 1. 3. 2022 17\\n\\nEzurich Foreground Background Estimation X* = argminx E(x) X This optimization problem can be solved by transforming the energy function into a min-cutlmax-flow problem and solve it (S-\\\"F\\\", T=\\\"B\\\") Max-flow min-cut theorem_ The maximum value of an S-T flow is equal to the minir capacity over all S-T cuts: Ford-Fulkerson algorithm to compute the maximum flow Energy optimization equivalent to graph min-cut Cut: remove edges to disconnect F from B Minimum: minimize sum of cut WAi BPORRIHc INFORMATICS edge weight cut B Gunnar Ratsch 1. 3. 2022 18\\n\\nz==0 Z=5 z=15 2=25 z=35 2-45 Our Method Ground Truth Figure 3. 11. Segmentation result for neuron with ID #39828 S Behrouz Tajoddin; M. Sc. thesis, 2011\\n\\n\\n\\nEzurich Topic 2: Image Classification 2*072 tnn 07135p Caltech 101 dataset Fei Fei et al., 2004 Gunnar Ratsch 1. 3. 2022 21 WAi BPORRIHc INFORMATICS\\n\\nEzurich Neural Networks for image analysis Neuron activation (weighted sum of inputs+bias) m Uk Uki\\\"i + bk i=1 activation function p X2 usually non-Iinear Input e. g. tanh, sigmoid and ReLU signals Summing junction Output Yk Synaptic weights Activation function output Yk p(uk _ Bias 9 Fonseca DJ et al WAi BPORRIHc INFORMATICS Gunnar Ratsch 1. 3. 2022 22 Wkl Wkz) 9 {0 Wkm)\\n\\nEzurich Neural Networks Hidden Layers Regular Neural Networks one input layer multiple hidden layers the more layers, the deeper the model #neurons at each hidden layer can be 1 different one output layer 3 3 One connection = one parameter Fully-connected NNs have a huge number of parameters. E. g, For input images with size 200x200x3, a fully-connected neuron in the first layer has 200x200x3-120, 000 weights. Neto LB et al Wi BPORRICHT INFORMATICS Gunnar Ratsch 1. 3. 2022 23\\n\\nEzurich Drawbacks of regular neural networks Huge number of parameters do not scale well to large images computationally heavy local minima during training overfitting Make no assumption on the locality of pixel dependencies the nature of image: neighboring pixels have higher dependencies than pixels far away regular neural networks are unable to extract local features using only global weighted sum => Convolutions to build-in 'locality\\\" WAi BPORRIHc INFORMATICS Gunnar Ratsch 1. 3. 2022 24\\n\\nEzurich How do convolutions work? Convolutional layers consist of a set of small-size filters extract local features from the input layer; or outputs from the previous layer 1 1 1 0 : 1 11 184/3 4 1 1/2i4/3 3 1/2 |3|4/1 1/3/31/1 3 |3 /1/1 0 11 1 1 0 1 I+ 1 0 1 1 0 1 0. 1/1|0 1/1/0 | 0 I Image K Filter I*K Petar Velickovic, Cambridge Spark Wi BPORRIHEs BIOMEDICAL Gunnar Ratsch 1. 3. 2022 25\\n\\nEzurich Convolutional Filter Examples | Smoothing filter F[v, y] G[~, y] 3 Original Blur (with & box filter) Note the edge artifact * Identity filter 20 '0 20 20 *0 '0 20 \\\"0 Original Filtered (no change) Note the edge artifact. Hundreds of other filters that have been developed of the last decades for specific needs, including denoising, sharpening etc \\\"2D Convolution\\\" WAi BPORRIHc INFORMATICS 26\\n\\nEzurich Convolutional Filter Examples II Examples of commonly used filters in image processing To 011 Original image Robert Cross 011 1 Prewitt 11 3 1 01 WAi BPORRIHc INFORMATICS Gunnar Ratsch 1. 3. 2022 27\\n\\nEzurich Convolutional Filter Examples III Examples of commonly used filters in image processing Original image LoG filtering 0[ 0]2 T[[ ~2 16 ~2 ~l12 F 0 ~l 0 Laplacian of Gaussian (LoG) The filter parameters in convolutional neural Wi BPORRIHEs networks are learned not pre-defined: Gunnar Ratsch 1. 3. 2022 28\\n\\nEzurich Convolutional Layers Images:| multiple channels (e:g: 3 color channels, RGB) Define window size, e:g: 3 X 3, 5 X 5, input dimensionality Chose number of channels k layer width Kernel weights parameters 2 (Sx, 6y, 3), i = 1,.. k, j = 1, 2, 3, 6x, fy EUR {:, -1, 0, 1' ''. } Feature map feature function applied to shifted signal kvector associated with every grid point (1, y) WAi BPORRIHc INFORMATICS Gunnar Ratsch 1. 3. 2022 29\\n\\nEzurich CNN: Single Image Channel Source pixel 0 0 0 0 0 0 0 0 0 0 2 0 2 2 0 2 2 2 0 2 0 0 0 4 0 1 0 0 1 1 0 0 0 0 4 0 0 0 24 08 Convolution kernel (emboss) New pixel value (destination pixel) WAi BPORRIHc INFORMATICS Gunnar Ratsch 1. 3. 2022 30\\n\\nEzurich Convolution details 1 R G B 1 L width ReLU CNN layer map n C 0i, j, (6u, 6 Ov) 8j (u-du, u-ov), j-1 fu, 6v 4-tensor 3-tensor Zi, ( (u, v) 3-tensor WAi BPORRIHc INFORMATICS Gunnar Ratsch 1. 3. 2022 31 1\\n\\nEzurich Convolutional neural networks C3: f. maps 16@10x10 INPUT C1: feature maps S4: f. maps 16@5x5 6@28x28 32x32 S2: f_ maps C5: layer F6: OUTPUT 6@14x14 120 %64 layer 10 Full connection Gaussian connections Convolutions Subsampling Full connection LeNet-5, Yann LeCun et al Convolutions Subsampling (pooling) Three main types of layers Convolutional layers (here, Ist convolutional layer has 6 filters) Pooling layers, also called subsampling layers Fully-connected layers bi BPORRICAE INFORMATICS Gunnar Ratsch 1. 3. 2022 32\\n\\nEzurich Convolutional Neural Networks Pooling layers downsample the representation size from previous layer reduce the number of parameters control overfitting max pooling 6 8 3 4 L 2 3 average pooling Types of pooling max pooling (most commonly used) average pooling LZ-norm pooling 4 6 6 8 2. 75 4. 75 3 1 1 0 1. 75 1. 75 1 2 2 4 L2-norm pooling 7. 3 10. 6 3. 9 4. 6 WAi BPORRIHc INFORMATICS Gunnar Ratsch 1. 3. 2022 33\\n\\nEzurich Convolutional neural networks Fully-connected layers have full connections to all outputs from the previous layer are usually used as the last layer(s) of CNNs S2 Input inage 10@ 5x5 2x2 C3 FS 60*60*16 56*56*10 28*28*10 S@ 3*3 S4 3@ 13*13*5 26*26*5 2*2 1xl*3 13x13*5 BH IN Max-pooling Convolution Fully-connected Max-pooling Cowvolutio Haj-Hassan H et al 192 192 128 2048 2048 dense 128 1224 den: Idense 1000 192 192 128 Max pooling 2048 224 Max pooling 48 Max pooling 2048 Istride of 4 128 Wi BPORRIHEs Krizhevsky A et al Gunnar Ratsch 1. 3. 2022 34\\n\\nEzurich CNN Vision Architecture Typical CNN architecture for computer vision pyramidal structure. Depth, lower resolution, many filters and fully connected at the end. AlexNet (2012) Input data Convl Conv2 Conv3 Conv4 Convs FC6 FC7 FC8 13X 13 X 384 13X 13 x384 13X 13 X 256 27x 27 X 256 S5x 55X96 1000 227x 227 X3 4096 4096 WAi BPORRIHc INFORMATICS Gunnar Ratsch 1. 3. 2022 35\\n\\nEzurich Visual Feature Hierarchy Layer 1 Layer 2 Zeiler MD et al Convolutional Layer 2 original image WAi BPORRIHc INFORMATICS Gunnar Ratsch 1. 3. 2022 36\\n\\nEzurich Visual Feature Hierarchy Fc Sh Cozh Layer 3 Convolutional Layer 3 original image Zeiler MD et al WAi BPORRIHc INFORMATICS Gunnar Ratsch 1. 3. 2022 37 'cogr\\n\\nEzurich Advantages of convolutional neural networks Parameter sharing in the convolutional layers reduce the amount of parameter and computation control overfitting Encode the spatial dependencies at different levels able to extract local features from the lower layers and more abstract and global features on top of the local ones Excellent performance on image classification tasks WAi BPORRIHc INFORMATICS Gunnar Ratsch 1. 3. 2022 38\\n\\nEzurich How to train convolutional neural networks? Parameters (excluding hyperparameters & architecture choices) filters in the convolutional layers weights in the fully-connected layers (The pooling layers are non-parametric) Input as much data as possible data augmentation: translation, rotation, scaling and random crop Depth the more layers, the deeper the model, the better Challenges long training time even with much fewer parameters than regular NNs overfitting caused by the large number parameters in the fully-connected layers (a common technique used to control overfitting: dropout) GPUs make convolutional models feasible Wi BPORRIHc INFORMATICS Gunnar Ratsch 1. 3. 2022 39\\n\\nEzurich Common pre-trained networks LeNet VGGNet ResNet YOLQ Classical CNN topology L VGGNet (2013) D-64 Fc GuNet D=128 D-255 D=512 Typical approach: Pretraining on very large datasets, then fine-tuning on applicationspecific datasets Finetune this D=512 0=1035 D-40S5 D=100J 224x224 112*112 55x55 2Bx28 14x14 FC FC FC Soltmax Reuse this WAi BPORRIHc INFORMATICS Gunnar Ratsch 1. 3. 2022 40\\n\\nElzurich Image segmentation with U-nets 128 64 64 input image tile output segmentation 8 8ll map 9 8 E08 128 128 256 128 8 8 8 aaa 256 256 512 256 8 83 80 2 512 conv 3x3, ReLU copy and crop max pool 2x2 up-conv 2x2 conv Ixl 8 8 512 512 1024 00 (O Y Lo 8 1024 https IIlmb informatikuni-freiburg delpeoplelronneberJu-netL WAi BPORRIHc INFORMATICS 41\\n\\nEzurich Take Home Messages Superpixels can reduce resolution without much loss of key image elements Image segmentation is very useful, but non-trivial to get right Markov Random Fields Traditional neural networks often don't work well on images due to overfitting Image filters are very useful & powerful Convolutional Neural Networks Build on filters> we can learn them deal with large-scale image inputs Parameter-sharing in convolutional layers helps reducing overfitting Pooling layers aggregate information to lower resolutions Convolutional and pooling layers learn feature extractors that are the input the the final fully connected layers Training takes long and can be parallelized (GPUsl) WAi BPORRIHc INFORMATICS 42\\n\\nEzurich Cancer Diagnosis Workflow Radiology Screening (X-Ray Triaging CT PET Surgeon Margin Status Surgical Pathology Diagnosis Staging Oncologist Treatment Molecular Pathology Genomics Wi BPORRIHEs Slides on Digital Pathology are courtesy of Gabriele Campanella and Thomas Fuchs\\n\\nElzurich Pathology Workflow Check-in Grossing Processing Embedding Cutting Staining Slide Preparation Slide Analysis Diagnostic Reporting Biopsy Wi BPORRIHEs 3\\n\\nEzurich From Pathology to Digital Pathology From glass slides to digital slides Better retrieval and sharing Opinion from other experts Opened doors for machine learning researchers Idea is not to replace pathologists but to make their life easier Automating redundant time consuming tasks Discovery of novel biomarkers WAi BPORRIHc INFORMATICS Source: https:Ilwww. leicabiosystems com/ APERiO IMERSA [\\n\\nEzurich Pathology to digital pathology H&E images: thin tissue sections (3-5 um) Hematoxylin and eosin staining Purple staining of nucleus Pink staining of stroma and membrane Access to different resolutions like in a microscope Image in highest resolution ~100kx100k pixels Image A Multi-framc Image B 1 Multi-frame Image EUR 63, 744 pX 3, 000 pX 1, 200 px 300 px Image Pyramid DICOM Objects Whole slide image (WSI): Pyramidal image Bruce A Beckwith, Digital Pathology, 2016. pp 87-97 James Cuff; https IlwWWnextplatform comL, 2018 LNi BPORRIHc INFORMATICS\\n\\nEzurich #Tasks at hand\\\" for a pathologist Gleason's Pattern 1. Small, uniform glands Well differentiated Bottom up: Prostate adenocarcinoma Cells: cell detection, cell typing Nuclear features predictive of survival, grading of cancer Example: counting mitotic cells Glands: detection, segmentation Shape and structure of glands important morphological featur Example for prostate cancer diagnosis Tissue: grading, tumor detection Eg: gleason score grading of prostate cancer tissue 2. More stroma between glands Moderately differentiated 3. Distinctly infiltrative margins Poorly 4. Irregular masses differentiated of neoplastic glands IAnaplastic 5. Only occasional gland formation Source: wikipedia WAi BPORRIHc INFORMATICS\\n\\nElzurich Image Data Deep Learning for Identifying Metastatic Breast Cancer Harvard, MIT (2016) 400 Camelyon Detecting Cancer Metastases on Gigapixel Pathology Images Google (2017) 400 Camelyon Classification and mutation prediction from non-small cell lung cancer histopathology images using deep learning NYU (2018) 1, 600 TCGA Pan-cancer computational histopathology reveals mutations, tumor composition and prognosis EMBL (2019) 9, 754 TCGA WIN RFORMAtics\\n\\nEzurich Large Image Data 470 ImageNet 14M images Whole-Slide Images WAi BPORRIHc INFORMATICS\\n\\nElzurich Expert Annotations ESt 02 2 WAi BPORRIHc INFORMATICS\\n\\nEzurich Goal: Clinical-grade Decision Support Given a WSI, return: Score representing tumor probability Highlight lesion location Campanella et al, Nature Medicine, 2019 https:IIwwWnature comlarticles/s41591-019-0508-1 WAi BPORRIHc INFORMATICS\\n\\nElzurich Clinical-grade Decision Support 1. Proposed a method that does not require manual annotations 2. Use datasets much larger than previous studies 3. Learn from the full wealth of biological and technical variability 4. No data curation is necessary 5. Better generalization to real data in pathology practice 6. Defined clinical relevance for computational pathology 7. Proposed a strategy to integrate this system in the clinical workflow bi BPORRICAE INFORMATICS Campanella et al., Nature Medicine, 2019\\n\\nElzurich Clinical-grade Decision Support 1. Proposed a method that does not require manual annotations At least one_tile_is positive AIl tiles are negative Multiple Instance Learning Dietterich et al. 1997 Campanella et al., Nature Medicine, 2019 WAi BPORRIHc INFORMATICS Positive Slide Negative Slide\\n\\nElzurich Clinical-grade Decision Support 1. Proposed a method that does not require manual annotations Ranked Tiles Tile Probability Instances Classifier Wi BPORRICHT INFORMATICS Campanella et al,, Nature Medicine, 2019\\n\\nElzurich Clinical-grade Decision Support 1. Proposed a method that does not require manual annotations Top-1 Tiles Slide Targets 1 Model 7 Optimization bi BPORRISAC INFORMATICS Campanella et al., Nature Medicine, 2019\\n\\nElzurich Clinical-grade Decision Support 1. Proposed a method that does not require manual annotations Evaluation Learning 105 in 1 4 weeks WAi BPORRIHc INFORMATICS Campanella et al., Nature Medicine, 2019\\n\\nElzurich Clinical-grade Decision Support 2. Use datasets much larger than previous studies 58Tp 12, 160 slides 2016 sign-outs @MSK ] 1. 9Tp 2. 5Tp 400 slides Prostate In-house CAMEIYONI6 IMAGENET Wi BPORRIHEs BIOMEDICAL 1884 Campanella et al., Nature Medicine, 2019\\n\\nElzurich Clinical-grade Decision Support 5. Better generalization to real data in pathology practice Prostate 1. 00 20x 0. 75 1Ox J 0. 50 0. 25 Scale Ensemble (AUC: 0. 989) 2Ox (AUC: 0. 986) 1Ox (AUC: 0. 983) 5x (AUC: 0. 974) 5x 0. 00 1. 00 0. 75 0. 50 0. 25 Specificity 0. 00 WAi BPORRIHc INFORMATICS\\n\\nElzurich Clinical-grade Decision Support 6. Defined clinical relevance for computational pathology Melanoma: 111 dermoscopy images 6 Algorithm: AUC = 0. 91 Dermatologists (21) Average dermatologist Sensitivity Wi BorMArtal: 2017\\n\\nElzurich Clinical-grade Decision Support 6. Defined clinical relevance for computational pathology 1. 00 0. 75 J 0. 50 0. 25 Positive Slides 0. 00 25 50 75 % Slides Reviewed Wi BPORRISzics 100\\n\\nElzurich Clinical-grade Decision Support 7. Proposed a strategy to integrate this system in the clinical workflow [ bi BPOREATC INFORMATICS\\n\\nElzurich Clinical-grade Decision Support 7. Proposed a strategy to integrate this system in the clinical workflow WAi BPORRIHc INFORMATICS\\n\\nElzurich Clinical-grade Decision Support 7. Proposed a strategy to integrate this system in the clinical workflow 1. 00 0. 75 J 0. 50 0. 25 Decrease workload by 75% Predicted Positive Predicted Negative 0. 00 1. 00 0. 75 [ 0. 50 0. 25 0. 00 WN 25 50 75 100 % Slides Reviewed\\n\\nEzurich Summary Computational Pathology Computational Pathology is data rich Prime example of using deep learning on medical images Proposed approach leveraged weak labeling of images Takes advantage of vast image archives at a large cancer hospital Proposed innovative ways to use methodology in clinical workflow WAi BPORRIHc INFORMATICS Gunnar Ratsch 1. 3. 2022 64\\n\\nEzurich Solving 'Tasks at hand\\\" using ML Cell detection and classification: HoVer-Net: Why: nuclear features predictive of survival, grading of cancer Nuclear Pixels Prediction Post Processing 70810 7Q80 OQUO 70 188 3 68 30 9 D JoOocool/ 237= Instance Segmentation UquO HoVer Net IDo 7 7 0ooos 1 Horizontal and Vertical Energy Landscape and Gradient Maps Instance Markers Input Image Horizonta and Vertical Map Predictions OOlooool Instance Segmentation and Classification Blue: epithelial cells Red: inflammatory cells Green: spindle-shaped cells Cyan: miscellaneous cells Nuclear Type Prediction Graham, Simon, et al \\\"Hover-net: Simultaneous segmentation and classification of nuclei in multi-tissue histology images. Medical Image Analysis 58 (2019): 101563. WAi BPORRIHc INFORMATICS HoVer-Net pipeline\",\n \"I Just want to recap quickly what is already announced to the class because we now have this beginning of room for everybody to joining persons. We will prioritise the impression teaching, but there are a number of students for various reasons who will be joining remotely. Of course, if health or other issues are relevant for you, please do not feel an obligation to coming frozen if you need to from home. That's totally fine. So to start, there is a number of questions on the module queue on a page that I'd like to answer. Imjus can I take these in the order of top of posting. So one of the questions is is it okay to copy paste from the note looks for the homework in general yes so you know you're doing it. It's good to understand. I Do not recommend copying and pasting and just blindly just copying the notebooks. It's better if you understand what you're doing, but so formal there's no formal constraint on doing that. The second question is homework emissions, so there's a number of them about what to do with the field. out, no books. The edge flow page has now been set up and you should be able to reach it from the course model if there's any questions with that. let me your after to now but there should already be submission pages for all of the reducing homewares and I think in ask the same thing. The edge flow had not been installed yet, but it should be up there now. so another question is about or these this? thank you for those questions but I think those are okay now. Okay, so could you raise your hand if you in the Computational Statistics recession only I Think with three students, we can not change it. Unfortunately the the attaches are recorded and so please just watch the recordings. And if you join at eleven at the end of the computationastatistics season, you could still be able to ask questions andsomebody asked if computer science bachelor students can count the credits as gas or minor courses and actually do not know about that So it would say ask or someone in registration for your department at experience be two of that because it was one of the guys who actually ask you only not looking that based people on Du Tesigns datchelers this force is ramble even as an agantion step as well as a go as a guest from basically depend on how you begin when you chose your when you chose to basically woman in store, future of future enron, an ex sex vacations of them that be used as a agitation scope or or compulercize nature if you show that there's a guest hop analysis over as a guest. Score Great Thank you. Relax Are there some other questions or clarifications about that? I Probably is not answer but check with your registration officials at your department and let me know if something if you need more information about that right and somebody just asked to clarify if the course projects and the district are counted differently and mean they are counted actively in terms of course credits. but I'm not sure if you can count them under different programs. from my perspective if you're department will let you then it will let you. I do not have any problem with that or right? I Thank you Dived for posting that you can coordinate this with the drink study administration and that's on the model if you want to check out. Answer: Okay I think those are all the you and a page questions. Are there any others that anybody want to bring up live to Asked: Can I do the course project in the summer or or is it does during the semester You're supposed to do it during the semester but it's not due till the end of the summer. But if you have any question about locals just set me a note role you can figure that about. Okay so this is related to the question about the homework what's due So on Thursday by midnight area will go over the homework homework one on Friday morning on the to session. it's best to submit the just the ipi and believe the Jupiter notebook few directly onto edgeof for their completion grades so we do not go through every homework and check them. But so you'll get full credit for completing the substantial completion so this is going to be targeted to combination you will just spotcheck radio, low spot check up and also a programmatic component if you have any questions about whether you are whether a specific programming assignment got for fighters or not a just a check with autism we do not like you know it it becomes available for late later you its past fails so it's just zero or one whether you is if you in our judgment and I think the homewerks are not that sophisticated so I think you can give in every section and more can try a good try even if like if there's an error that you just can not figure out just limited anyway and if there's evidence that you tried it then you get clean if there are there any questions or feed back about the first to session. does that a formate work for everyone at you mean should every person work on their own notebook? Yes yes So I mean for the assignments you can work as groups to solve it but prepare your own notebook and limit that on your does I answer your question. So for this data session on fairy app will go over as the homeortist to be ready and then also the week to notebook on tocanizatioand you can use the Qna page from on on model for questions in the tax session as well. So I mentioned this a little bit last week about the final assignment this is going to be first. It's just a kind of an exam you might say where it's just covering all the material of the course and it will be distributed a week or two after the class ends. It'll provide the exact dates with plenty of time in advance and it will just be based on the ladies and the required readings. It will be designed to be pretty quickly two hours to complete, but you'll have a few days there. Four days may be a week to complete it so you can schedule this around your other obligations at that time and we will do some review and will show you some sample questions during the class or so. Last week we started with collecting and cleaning corporate and doing a quantitative measure from corporate and also a dictionary methods. It wanted to introduce an example of a dictionary method paper from my own research to see how these dictionaries are designed to solve social science problems. So this is joint work with David So who is a student with me here as well as Art at Being At Work and warm rural at all. And this paper is motivated by the recent discourse about race in ethnicity, issues of discrimination and prejudice in equity as a policy problem and does that policy problems motivate that there of above more research about this issue And there is this anecdotal or stereotypical understanding that Economics as a social science compared to political science or Sociology has done less work on these issues and previously this was just in people's heads. But in this paper, we actually look at this eventuality by applying nil up methods to the text of scientific articles. So we built a corpus of five hundred thousand academic publications from these three disciplines, and we used a dictionary approach to identify against the relevant articles. and so please read the paper if you want to see all the details. But I Wanted to note a few highlights just as an example of the dictionary methods approaches that we started discussing last week. So first we considered all publications in the journals that J Store characterizes as comprising the disciplines of Economics, Sociology in Political Science. We also topped up this corrupt with additional articles from the Scoops data set and also the Web of Science data set. And so the goal was to get close to universal coverage of published articles from and Beaten and Sixty up until to Thunyad Twenty In that end up with a half a million in publications. So this is exemplary of this first step in social science research, which normally in an opinion class. You do not worry about this right you just take the courses given, but in order to answer questions such as how much have the different social science disciplines been spending on these issues of discrimination and so much time have they been spending on it, you might not be able to answer that immediately because the data and does not exist. This shows that building data and cleaning it's this important part of social science applications of us even before you get to the modeling part. So read this live can you like? But I just wanted to point out that our dictionary method is actually quite subtle in the sense that we we be matched a number of patterns. so we have a number of synonyms which we which we refine by looking at examples of different race inathnicity groups as well as related topics. So we did not just look for the word black because that light refers to the work to the colour right and so in addition to requiring the mention of a research group is also required a mention of of a topic of discrimination, inequality, prejudiced bias, historical issues like slavery or him cargo so that in order to yoincrease precision of the resulting dictionary based classified identify articles that are had do not just mention these relevant words but also have a related a substantive topic and shown we did invitations of articles. This turned out to help a lot because often times you'll have an article that's about you now minimum wages and you'll have like in a system abstract describing what the paper does and then it's only the last sentence that says and then we looked for heterogene eighty by white, black in hospital that's actually quite pharmaceutical and so we wanted to focus on articles that were very so typically about the issue of race and acidity and these different subtle revolutions and refinements of our dictionary base method allowed us to do that. These are the main results from the paper showing that even though Economics in the top left, economics is in blue, political scientists and green and Sociology is in red. Even though Economics has the most papers, it has the least papers about race in ethnicity issues and Sociology has the least papers but the most papers about race in activity issues and so this anecdotal since that sociology especially but also political science are paying attention to these prejudice discrimination in equality issues that been anecdotal since that kind of conventional wisdom turns out to be right at yes that's just because of coverage. actually they're not in the database so if you so probably to get higher quality data is that we should have finished around twenty and fourteen or so. But it's because it takes some time for all the journals all to their articles to get into the day to base the easing of. But but if you included everything for real it's going on Still is even speaker destroy the way the number of range related obligations who then do through thawing wanting worse or it's journal quality weighted the question. We should put populist that into the title, but it basically multiplies the number of articles by the impact factor of the journal. So basically journals, they get sides often, so they're kind of more influenced on the discipline. they count more on the graph and so this is really just. this is mainly a robustness check more than anything but we. He wanted to check that it was not that Economics might have fewer publications, but they're much more impractical. We can rule that out with the bottom left panel only that's dictionary methods. Now we're going to get tobasically the whole. Most of this lecture today is about taking documents and transforming them into restrictions, and you'll see that there's many ways to do that. Almost everything that we do in this chorus is about document representation learning, a numerical representation of plain text documents, and the process of taxation, which is nsegmenting the document like breaking it up into pieces like paragraphs or sentences or words, or where pieces or letters. That's a pivotal step in that process. So just to summarize what we're doing, we start it off with some plain text documents and what we're doing today is converting notes into tokens, and then one of the workhorse document representations in blip historically and also still today is immigrants which is basically it phrases. So we a document goes from plain text to account a frequency distribution over two or three word praises and is be going to try to keep this notation consistent throughout the course. of course some papers do it differently so it do not be hundred print. but will you use capital detail referred to the documents in capital way to refer to tensions like lists of words and capital x will be some market representation or frequency representation constructed from the tokens. So to summarize and you when you are reading newspapers and somebody asks how did they recognize the documents and why these are the country factors that would be relevant for us so they need to be informative or predictive for some task be text classification, doing a topic model, training, word embeddings, or building a language model they should be. This is one of the goals that is often not satisfied, but ideally the tokens are interpreted so the holding everything equal would be useful to be able to count the number of times each word it's mentioned rather than each letter. If you just had a representation of a document that was counting a number of times each letter like G X C L was was mentioned, you would not be able to look at that data and understanding anything about the document was about. But if you could look at the talk words or phrases in a document then that will tell you we' much more interpretable and then finally treatable. So this is becoming less and less of a problem as computational resources increase. But it's still the case that you know you have a million sentences in pose and you need to compute the paradise similarity between the sentences. Let's say I want to have a million sentences I want to know how similar are they to each other? That's a million times a million comparisons, right? So how those sentences are and presented will be really important compucationally. So there are two broad approaches to this one. if you might call in the standard or classical approach that is from pre neural nets or at least pre recurrent natural nets blip where documents were represented by counts and by the longer sequence longer and sequence information in the document was removed. And then there's a more recent approach which is to maintain the sequential representation documents so you can imagine in Tea. In the first approach, you take a playtext document and you get some counts over different words. In the second approach, you get a list just the origin, all words you take in the whole document as data rather than accounts or clear over vocabulary. This is a no kind of Opstrak, but we will see examples. Here's a askematic for the first kind of of recognizing pipeline, so this would be a way to do this in Python in ineltik. The first line reads in raw hotel text from your website. You can clean out the hotel using the number of approaches such as a beautiful soup. take some snip it of the data tokens would be Taking the raw text and splitting it on the space is what it normally means. You get a list of words rather than strings and then for example you could say you do lower that will give you the lower case that will start doing so. preprocessing, putting everything in the lower case for example. and then the vocabulary at the end is just the set of its heat of unique words. In the purpose in this process of tocanization, you can think of it as building a degree. arguably most of the time it's kind, the space of possible representations that a document can be mapped into. The second kind of tocanization as used in transformer models. This new generation of help. Most of the time they used what you would called subway tocanization and the form A practical standpoint for this type of recognizing this standard type, you probably want to use spicy or Ginsim to do that. Those are currently standard tools that have like these standard recipes for doing that or Psychic Learned Tide Factorizer Back to for this type of toconization you want to use the Hugging Face toconizer. So I think after you are ready introduced that hugging base system in the tax session. So we will be. For those of you who are doing transformer based help models using context sensitive help, context sensitive embeddings, then the hugging Face Stadium is something that you will want to invest in in Learn Houdworks. So for example for Bit which is that is guess to' kind of one I just be the work horse for short document in all using Transformers rather than a vocabulary that includes all words, they have a vocabulary that includes subwords. and so for example you a word in the best vocabulary. it could be three words or four words. So here you it says playing could be reduced to play an wing. And so that's why it's called forward Tocanization because it would take the two world pieces and treat them as two separate words in the vocabulary. We'll come back to this. So before we're getting to this token representation or either list or or counts over tokens, the first set of choices that you'd want to make in developing good data is preprocessing the text. So there are many steps to this process potentially. And as it mentioned, there's a number of recidities such as a physicist learns stiff facterizer or Chinsim preprocessing or or some of the space functions that will make a lot of these decisions for you. But it's important to think about these because before example, whether you move, capitalisation or punctuation could make a big difference for your downstream outputs depending on your task. So the first usually is taking full documents and splitting them into smaller pieces so you might want to take the one that often uses letting into sentences and as a mention of a you have this task that's doing pairwise comparisons between a million sentences you have had to get the sentences in the document first right and so spicy I think would be a standard sentence solution. so when you input a document into space it will immediately do the splitting and it will adjust for nperiods at the end of more or messes or abbreviations on us a porch things like this and you'll get informative set of sentences till work. A lot better than just splitting on periods or full spots. In terms of splitting paragraphs and documents, there actually is no standard solution to that because that's going to be a quite domain specific how paragraphs are split. If you are using hotel there will usually be in be the pop tag or the bar tag and if you're like in digital documents then you'll have to have a costumed solution. I will tell you that for most digitized documents like over a line that ends with the period or it full stop orquestion market explanation point that's almost always the end of a paragraph so it do not be perfect but you can use that as a short cut to split paragraphs using our data. Just align it into the period and you know part of this breaking up process. Preprocessing is the idea. Threats is something that will repeatedly come back to is that unstructured text date has lots of noises, is a lot of extra information that is not useful and so we need to develop our pre processing and featurization steps to extract important information and exclude the irrelament. So of course theatres many papers about this, but the dining supporting paper, for example. They undertake a number of systematic investigations of different pre processing choices and especially for unsupervised learning. So remaining a topic model or clustering things like this, the preprocessing makes a big difference. Fortunately, this paper also shows that for supervise learning, classic machine learning, classified fiction, and regression, the pre processing choices do not matter as much as long as you have a sufficiently rich representation of documents. Text class birds will work well, so are choice is whether to remove capitalization and so usually the capitalized and non capitalized version. This is everything that going talk about is mostly about English, but you can imagine that there is going to be similar or parallel issues in garden or other languages. The capitalization issue is think even more nsomekind. It's like more interesting in garden, but in English of course knows are only capitalized at the beginning of a sentence. Risk in titles and things like that and so often times you increase the size of the feature space by treating capital letters differently and it's usually better to remove the there are so many exceptions to this rule. So in the legal context you can think of the First Amendment having a capital effort. Capital A Three is deteriorating about American law. So the First Amendment refers to Freedom of Speech and religion and things like that. And if you read the phrase of the First Amendment without capitalization, you know that they're talking by about you no specific law or specific contract or something but have the capital to capital as they're talking about the Bill of Rights to the U's Constitution is and so that will be an example. For legal documents, including capitalization could be pretty important. Also you of course if you're doing linguistic initiation of documents like part of speech tagging statistics, passing semetric role labeling, capitalization is really important for that. What causes what you might often have is the source documents are not capitalized correctly. So in the case of contracts for example, you'll often have sections of contracts that are all caps that all capital letters just to lie, highlight them. And and that's a problem because like things are a part of speech tagging and a synthetic parsimple break on on text like that. so you might need to do some custom checking. This is a nice example where punctuation is quite important. So I got these from Chairs Bail's slides. In general the rules about punctuation whether you should keep them or not. It's kind of the same as capitalization where usually they're not very informative if you have immigrant representation of your documents. but if you going to use your data for some linguistic invitations such as sentence splitting or part of speech tagging things like this then you need to keep the punctuation and information in your doctonants similar for numbers. Most of the time you should remove numerical information from the document just because if you are breaking up your document into matrix accounts is just counting that. How many times the number one mention you not going to be very useful or especially if you're counting like the number of times the word the number nine hundred, nine, two thousand and one hundred is mentioned that do not be very informative and no would say replaced numbers with a special character like a hashtag would you be a decent approach for language models like get two gptthree bright numbers are just treated the same as any other sea are and so they'll be part of the subdued tocanizer and will get to language models in weak nine. but this these big language models like God there actually can not solve mat problem. So if you give us some tiny pleas to plus two equals for two plus seven equals in nine things like this and give it a new problem by will actually often times provide the the correct inverse. This is particularly why we're doing this kids right now because this amazing technology of language models is transforming how lawyers and social scientists can use language as data and on for practice. But the this really exciting and intriguing finding that language bottles can solve math problems. It means that they're starting to have this kind of conceptual representation of language under the hood, but it still does not work very well. So it's easy to find math albums that get there does not can not solve. And so this is an active area of research and there are many projects to do for having language models understand numbers. And as a side note to that, having language models that can do things like fact checking for example, scientific claim verification and having them understand numbers is going to be a really important piece of that. You can see how this is practically important. Dropping software is a similar type question as punctuation and capitalization. There's a standard list of words that show up all the time in English, but do not really mean that much by themselves. On the other hand, it's easy again easy to think of counter examples in the word not is often troops a stop word. But if we're reading legal documents and we're trying to say you know what is the judge deciding having the phrase not guilty is inappropriate to include right and then just more generally in the law and in other technical all domains. Oftentimes specific phrases or means are going to have an independent and pivotal meaning beyond the words that are composing them. So like the classic example in America in law is beyond a reasonable doubt. This is like a very specific evidentiary standard in criminal cases. and the words beyond reasonable doubt by themselves. If you counter those, those would not be that informative and with all deliberate speed. That's another procedural phrase that's very important in the U's law. And so you can imagine that even though those those phrases has contained stockworks, right? So if you drop stockwords before doing in gardens, these would not come out in your future representation to one option here you're practically speaking would be to drop software when they appear by themselves so you do not care how many times the word a shows up. But if they show as part of these phrases beyond a reasonable doubt, you would keep them in the future representation. Another way to you refine the corpus to make it more informative is limiting or limiting. and so rather than include every word that shows up on its own, it could help to drop surfaces. So in English this would be to consigned consigned, consigning consigned with all four of those things are talking about the specific word route consigned and so applying a swimmer will remove all all those suffixes following a rule based algorithm porter sometimes I think we pretty well in most cases and there is something called limiting as well which spicy will do for you which rather than just split take off into the sentence, it will actually look it up in a dictionary and give you the world route only. So this was going to be designed for the hybrid zoom chat but lets lets just do this in person instead. So just to start practicing with these issues of of non word or style features in language, consider the following four dimensions of language based on the first letter of or last name. So read either if get disturbed based on your last name, think of a social science analysis or important legal or social dimension for example or judicial opinions for newspaper years for social media for political speeches, think of something that's interesting to you, perhaps related to your work or other things that can be either can be measured by capitalization punctuation would change depending on the use of software or we change depending on the use of simulator limbertizing and so we're actually at the the point of the break so is Let's think about this now and you can take a few minutes but also take a break and we're going to resume and a fifteen minutes a ten actor the hour and will give some examples of will have people to volunteer some examples for these categories so will see you in fifteen minutes stmepo degree to at it on the alright we're going to resume now so can I get a couple of designs from those where the last name we direct for something that can be measured with capitalization. Yes annual air last name starts with off someone who thought about capitalization being a relevant legal or social science point to deal animal which for you and do you want to start us So I'm asking for some volunteers for these four groups. So what is your last thing to start with? So great. So you are in the first category then did you think about this you come up to an idea for something social science on social media, law, political reached where capitalization would be this important dimension. Let's okay we havepouned the back here yes at and that's interesting right? So you can imagine you people who have the more years of schooling. maybe they have a longer's longer a more complex language it has or fewer periods but more cocombas for example interesting And the abundance of pertulatory events using doctors bacors be cause more along plots heat this shorter grade umation interesting right? So you can imagine like in liked transcription of patient's speeches. if there's more like dots that they're breaking up their language more because thneither'r taking more extra both is like that. That's interesting yes and at often the J owl are done although it are a simple passport and analysis version by the biological and that so just an etaspoto to the court other is so and what was the role of the porter semester in your approach analysis there can be built that would plate time analysing, forge in on and so porter sometimes I guess it depending on what you're in your context. the swimming was worth while because the world endings did not have as much information is needed. Is that fair to say He I see thank you any other exact yes constitution age weights that like the First Amendment ex ample or like you's constitution article there and do not be among number is for example woman numerals are often capitalized and so like luck it is going to be different than just looking for lower case and so interesting at at to can topecifoe report on a rise to iyaitionsa to get and of new truly vote changers sometimes right But I think that ye turkish and maybe Japanese like this too much to but through examples sometimes like a whole synthesis in one word right like you could have like addition or her like tin surface saves is one thing. So why you won you very careful what's standing in Turkish so I think that's a very good very good. totally once here and or son the second cussificationak shout interesting Totally right yet so like tweet the yealternating fur lower or K R I d' I did not put it here but I guess you could include a mode jesus punctuation as well about thinkffor a syntomate classieria all cap it's going to make a big difference. you have a well at of routine on tree was root seeing airport announced special rises of interesting totally right. So like all naninnity recognition for example. so if you want to who you let them second but like if you want to in English especially when things are capitalized they have some kind of special properties to get them of personal about. for example procedure for it is important another point there yet he and interesting right? So that's a interesting thought right? So you could imagine like there could be some systematic methodical way to see know for what words are, timing or limiting capital that's a good project idea potentially to right. Let's move on. Thanks everyone for those on the zoom if you want to add examples please type of Imachap. Also now we're going to move forward with recognizing the most basic unit of representation in a text is a token. So thats what were going to refer to as when we break documents down into pieces those pieces usually words are tokens and so you could imagine. One simple way to do this would be you represent documents as a list of characters or letters. The standard approach would be splitting into words so you go to like a split on the space and then the third would be immigrants or phrases so we capture local world ordered as it for example with direct becomes with sir So when this is like the kind of classic workhorse in help would be a backwards representations. Basically just a corpus is broken down to a vector and for each word in the vocabulary you also get a number corresponding to how many times that word showed up in a document. And so you know just to introduce some languages here which we'll use repeatedly. For a given word in a vocabulary we can say we can have the document count which is the number of documents that had that particular word tocentype appears in a in the curbs and then term count would be the total number of appearances in a corpus. In then we can define a term frequency within a document as the number of times a token type appears in a document divided by the documentligso there's the number of words in the document and so sometimes that we might be a bit imprecise, but going forward if you tried to use the word counts to refer to integer, the number of times it competent occurs and frequency will be the share are the count divided by the links. So as an application of this in the political realm Monro and all or this is bighton words paper which is linked on the bibliography they recognize congressional speeches and ten identify words that are distinctive of Republicans and democrats in the U's Congress. So to do that first they run a topic model late to reached directly allocation which we will talk about next week to identify speeches about abortion, reproductive rights. So that's what is. They use a top model for purposes sick so this is it. Think pretty useful in the congressional record because it's very diverse what they're talking about. Sometimes the speeches are just like procedural there is saying we should not on air, not on the and those are not going to be very interesting politically and in the paper that they provide a number of ways to identify language that's distinct five of the different parties. so you know the Democrats at the left wing Republicans are the right wing party. This is from around to thousand and five I think they hundred six Congress and so you can see that when you just looking at the difference in the proportions of how many words are used, the most democrat word is to do it's a stop word and the most republican word is the T. And so this shows that this very simple metric of just the difference between the frequencies is not. There's not to a very good job of extracting distinctive tycoons. The second approach that they do is compete the law of auzry. Go for the words. This you would think this would help a lot because as racioit adjusts for the proportions of other works that are used and within this actually or living works. even worse because the most democrat phrase is bankruptcy which has in snow which have nothing to do with abortion right. So this is not extracting an interesting dimension of partisanship. Then you can look at the paper for the statistical the mathematical specifics of this. but they provide this interesting multinational lesbian model underlying language. If they specify that model and estimate the associated parameters, you get a much more interesting ranking of the words. So for within the topic of reproductive rights, democrats talk about women's rights for example their decision up to the family whereas republicans are talking about abortion procedure and killing babies. So you can really see how this kind of like difference in the difference in the framing of this topic really comes out once you try to these other token drinking methods and then finally this one they had a regularisation parameters to really to shrink most of the word premieres to zero while still maintaining this distinctive language. Yet that's great. Yes so not now. and I think that's a very good question of this specific ranking of the terminology and their sporting well in this context, but it seems quite specific what they tried so we actually do not know. they might havetried a hundred other things. So in terms of exciting we do not want to draw from this we sold always apply this method in every context, but in in this course you you know when we're when we're doing the required readings, presentations who were running the response essays. That's exactly the type of question do you want to ask the morons and their co authors they wanted. They're also probably selectively presenting this right and so some other questions would be used so I didnt even include. that's a good one right. Will this work in other domains? We actually do not even know if it works on other topics, even within the congressional record. They only try their reproductive rights at the abortion topic and so get these nice ranking of the words that we kind of know intuitively that if we want to extract this of slain dimension of reproductive rights discourse that it worked. but if it was another topic where we do not know such about, how would we know to trust the results So so waiver this is kind of. It's lucky that they chose this topic because the results are indicative. So in another domain where we would not have a lot of previous knowledge about the topic in mind to work and we might not even barely equivalent in another question or comment about ewhyheard some words so out of that supporter batter. So when you do a porter simmer on words it will replace them with these kind of placeholder suffixes. So words that end in why given ending in eye. So this is because like baby or bad they both become paid and but that's a good question. Do I about the presentation of this right? Maybe they should have used a limitizer instead because it makes it were to and of have to read the paper to even know it. but that's a fact. So I think and that's kind of related to you known they were saying that the difference in the frequencies do not work. The mean if they had just dropped stock words beforehand this would have looked okay right because they just dropped software. Then you can say women's rights their the Democrats abortioned baby procedures and publican that would actually already looked pretty good. And so this is the type of kind. Like what else could they have tribe alternative methods to get it the same place? That's way we want to start speaking about the you mean on horizontal axis it says frequency of word within topic. So if I think it's just how its the log of the of the frequency of the words. So but I Do not think it played a big role in our analysis. I Think it was just a way to add more space on the graph. So let's not road these types of questions. Okay, well, you know, is this figure effective? What else could they have done? What information is being concealed or left out? That's what we want to ask for. All these applications appears okay. So building a vocabulary. As already mentioned, this is basically the goal of frustration or tocanization. We need to think about. Well, how big is the space where our documents are being represented And so no. there is number of ways to do this and depending on your context it might not be do not afford it. but there's just kind of simple historical that usually work pretty well. So like any word that shows up in less than ten documents, for example, you know it's not going to be very informative for any analysis that you undertake, so you should drop it from the clear. probably. You could also impose a more complex threshold. sigh you not needs to appear twice in over twenty documents. I Think there's kind of limited returns to this, but I Like the ten document minimum or take the top twenty thousand most frequent words is usually a decent approach. A more subtle way to rank words is in the frequency is called to term frequency inverse document frequency waiting. So what? this does is in addition to avoiding information about how often as a word or a phrase shows up, that's weighted by how often it shows up across document in the whole countries. And it's inverse document frequency waiting in the sense that words that show up in all documents should be downweighted because they're not very informative about topics. So if this will kind of depend on your task. but let's say you you're trying to classify documents on breeding topics or you're trying to learn topics accidently at a topic model. The words that show up in every single document like the or a those are going can be very informative and inverse Document frequency waiting is a way to address that. And there's some example text at the farm on this slide to get a littlebit of intuition, but basic by this will upright distinctive words that show up often, but not in all documents yet. I Think that they have like Enpsycli Burn for example you would put one plus they smooth. there's a good stomping farmer btyso this is like one example formula but as you said it would be undefinend for some works this way. but they they add might made partner in this danger of implementation. So this is the the psychic learned implementation that is talking about. So there's a few different options but I think in the default one it's one plus the log of the document count. To address this issue. The intevenominer. But in terms of life for most tasks especially for doing it classification task this is your friend of if your corpus will fit into memory run stiff vactorizer on it to transform the words the plain text to earnings. When do you're ready to go you have a data set already that can be used for anything and so in terms of the preprocessing options it will actually already remove accents. Things like this remove capitalization and drops up. Words is the question at so the the idea Let's say let's see how this a regression model and am packed at up. Let's say you this will make more sense when we're doing were like comparing documents so let's say it will do this next week. But you want you want to compute the distance between two documents and if you when you vectorize documents as frequencies over words you might want words that show up and all documents to cut less and so if if you use this transformation you're basically re scaling the dimensions of the base so that words that show up in all documents means there not very distinctive you downright that dimension and so he think it's an important question because the while this will matter all for concise coinkedocument distance for other downstream tasks that actually do not make a difference and lets but I want to come back to your question next week. Buyout On think of it as it's releasing the corpus so that releasing the corpus representation so that dimensions are words that show up and all documents count less in those distance calculations. So stiff Vactorizer the is waiting. Its its optional so you do not actually do not have to do it and there is options for something. The if waiting on things like this time I don.' There has not really been enough kind of systematic comparison of whether if waiting helps or not but this will like only be task specific. so there are this. representing documents is like counts over words or counts over in brands is really the standard thing to do. Counts or frequencies and you can pick up an infinite number of other things to do right. You could say well in actual I Want take the log of the frequencies or just an indicator like for whether the phrase appears or not. Quadratic paradise interactions these winners often not done because of any given like text classification or top of motor polemhis just adding dimensionality without helping and there are not any kind of rules of them for beyond getting the ignorance frequencies. what else? What other featureization we should travel. I think immigrants is the way to go. and so just to talk about immigrants a bit more, These refer to converting lists of words to lists of phrases. so you know for this is a sentence, you have a number of bargains like this is is a sentence and then atrtrigrams are the three word fishermen. That sentence it is a sentence and this is useful for oclassifiers and things because it captures the local word order and I mean you can. This is a sentence is not a very good example, but like we talked about, you know the word tax versus tax cut or tax credit. You can emit like these two word phrases really in of a lot of important information in legal and other documents. and so this is why you normally use infants. and there's this link on the cillabus is that this is that code may be a little bit out dated now, but I think most of it is probably still accurate with this Google developer's text classification guys. They tried perturbing some of these different choices or use of standard text classification tasks and when you have long documented or relatively few documents then stiff weighted bargains is the base lie. They recommit which you know because there are so many different things to choose. This is a very satisfying to hear. I Think there really simplifies things that if you and the even they even give you like this. these told that if rose the number of rose divided by the document length is less than fifteen hundred. used tide water bridges and so they tried like words for his characters immigrants versus sequence which we're going to talk about later just bygraundi usually enough and then if I did and so this simplifies our or task. How many managers do you need in your vocabulary if you imagine that there's a hundred thousand words in the English language, the set of possible toward phase hundred thousand times on hundred thousand times to and so that is not going to feed in memory prison. The same Google Developers guide they recommend picking were thousand bygrands based on their frequency which I think is actually a really decent rule of them And so in the physicist learned stiff facterizer you can just say I want bargains and I want twenty thousands of them and it will do it automatically based on frequencies. but even twenty thousand any thing is in marriage cases is more than you need I've just like tired this in different applied classification tastes and even like to thousand features is often enough in adding more as diminishing returns. But I think that and well talk about feature selection and a bit for the if you're doing classification old let's sixty thousand immigrants based the frequency and a du feature selection to select predictive features down to ten thousand. A totally alternative way to get at this issue of high dimensionality is bashing bacterizer at E all year. Good question, right? So that probably seems a bit inconsistent with what I said. is it can to go about of painting rat? So what I would recommend is in general, you should include them frequent including those frequent words and what it would normally do is any words that show up in more than like forty percent of documents those get dropped and then after that take the option. Thousand Bees formulate this. Another historic would be drop the two hundred most frequent words that is usually actually a r by document frequency. So that's like kind of a representative of these southward usual words that are not very important, the most corporate, drop the top two hundred in the ten thousand flat. So but I think that the idea is that are the most informative words are the most frequent order, not that are formative and all the very internet words are also not that informative. So in the middle and a year we also look out at the dipran Yes, so there will be very few of those. so to be a bit more precise I would say dropstopwards and words that show more than forty percent of documents then make bargains from the remaining vocabulary and include the top to on tousan on that probably has. Not only are when we get to partings ansyntact and sometimes is to date because the right I show you an example earlier. Beyond a reasonable doubt that's like a forward phase that contains a sword but you need it to get the meaning, let of that. So these are kind of rules at Dumb and and it's if important to think the ways where they find not work for some cases is her on what performance did when need breeds or so the best. The most systematic one that I know of is this one Here is at least for text classification right they found they did is own array of stated text classification datasets and for the biogramds work as well as strike so we'll see an example later. This gains crown should appear at two thousand and ten paper at least in terms of interoperability like kind of the quality of the features. The programs are pretty nice in terms of getting phrases that are saved like in important narrative or political information in them. The programs were nice, but I think that for you just the quality of classified bygrands are normausually as good or there's diminishing returns as you had not at ignoring pills but another way to kind of dodge this issue is the washing vacterizer because with a washing vacterizer you can add even an arbitrary length of immigrants and they all get just mapped to an arbitrary idea. So has anybody heard of this before today So you know washing is this you known way function technology thats often used in cart criptography Right where you say basically you can take the string output and output a lower dimensional string and there's no way to get back to the original string. But that function is deterministic. So basically you can imagine making string and mapping it to a number. That's what a hash function does. And so I can build a hash function that has ten thousand buckets in it and any string will be randomly matched to one of those ten thousand ideas. But once I have the washing function, it's deterministic. So once if I have a washing function, I can then vectorize the whole corpus to a vector of ten thousand items. But what those individual strings include could be anything. so it could take words, Bigrams, programs, and quadrans. All of them get mapped to into this ten thousand dimensional hash space. Kethis is the illustrating it here under traditional vocabulary construction and specify the number of everywhere. so bell cats and dogs. but with the washing trick, they just get a random number. but that number is always the same so it's comparable across documents because there's collisions. So just by chance, the same number at the end of the chasing function represents multiple words. so that's what this is trying to to represent here. This is called a collision in the sense that if you only have ten thousand items in your hash function at in the output space, but there's an infinite number of strings in the input space so You can imagine that if you have ten thousand items in your hash function, two words have basically one out of ten thousand chance of being up to the same number. and so every when you're going backwards, every number at the other hash output space represents a lot of different strings. right? And love. Tune in your purpose. And so that's why the fact that this would never happen in practice, right were two or superrare that two important words would be mapped to the same idea. The way that you address this is you can actually use two washing functions And so if you use to washing functions is in the rare case where two important features are confounded in in the hash output, it will only be for one of the washing functions. enough in output. Yeah, there's two reasons that's that's by the main one is that it works for any vocabulary. You do not have to build it a crucially so new documents, it will vactorize them correctly and the other is so you do not have to have the purpose a head of time the whole purpose. So that's Ikapty. Very useful and it helps with dimensionality. So like you, if you want to use triggers for example in your feature or quatrans in your feature space, there's meusilians in them, but that will not increase the size of your questioning. A passion bactrizer. It's also very fast, like the computationally so you guess might have heard of it. The text classifier that Facebook uses it's called Fast Texts the Washing Bacterizer I be the celebrity compound technological in the fast Text of Text classified algorithm. So basically what it does is it takes documents and represents them not as the list of word ideas, but a list of hatched in grant ideas. So you get a bunch of these hash numbers for every document and those can be very quickly computed and can input into a classified so ththere'there.'s few practical issues, but also complicate ya, the can is alright. like to about isinin outfits about grids, the w he ears there or do to say ye let me plazamia That's actually nice analogy, right? So basically even in the standard model language for the same in type you you will have multiple words on it anyway, right? And this is just adding to that. Making it slightly is making it more more crook, right? The court's representation: Multiple meanings or words are loaded on to the same string anyway. but this actually just takes that a little wooden worth and allows for even phrases to be loaded onto a specific string. Yes, so this fast text model is. were going to get into this like week as or so. but like the the Fast Text Word Embedding model or Document embedding model uses passion processing and so you can embed the washing ideas the same way that ipludin bad words. So I would say it. I Would say it's under explore though so there are a couple of papers doing it. but I think that this is a very interesting technology that has not been not that to be I think in principle yes but I mean if all you if all you had was the hash ideas it would be difficult to check this. but during the corrupt verification process you can keep track of swapping alongside do things in is and then one is. I think it's a little bit unpredictable but would happen but at a minimum it might add anyone to your embedding tribe because there is going to be some other words that are included that are mapped him to men and woman's the she site you like. it's interesting. I mean you could probably do some of this theoretically like the distribution of world frequencies like this on the world frietsay who get sagazip's distribution and I think in the fast text model they allow for like a million stews or something. so Whether using embeddings they're using this hash fertilizer as well as allowing to for a million different spots and so in that case like collisions are very rare but I'm sure you could work something and based on the distribution of frequencies it would right? So yes so so I mean especially why no if you're talking about the original fast text vactorizer on social media data so you know they are like hundreds and millions of documents and sminicalliio and even for strangers or quadrans. So the that's an interesting question. I Do not think that they get it that directly. Was there another? Yet to be sure that as the stand directors of Poet, traditional economic construction and artists basic met one producing yes, Andritan a permit evident yes Exactly as so. This is great. We're get in to that like week six about getting into to embeddings. But but the standard toconizer will give you and time with a list of words, a sequence of words or account over words like that. One of the atectet is on one hop in coding or for account in coding and that nose could be limited. so be the. But you can imagine that a document is just to list of adhesives as well. so it would be the same thing you could apply an embedding look up to like a list of words or a list of ash ideas. Okay, him going to speed up a bit and then we so we can finish the next week. So reaching vactorizer is one way to deal with hydimontionality, another is to filter the vocabulary based on your downstream task. So for example, if you just take all of the world frequencies and get the correlati with your outcome variable. so let's say' we're like in the morning at All the fighting were its paper we're trying tobasicaly that was kind of feature selection right? They were trying to check which features were correlated with the different political parties and so there is a number ways to do this. but Key Squared for example would be like a classic feature selection approach. Let me think about what I should cover now. So one issue which were going to come back to the lot in this course is relating statistics to meta data. So let's say in the fighting words paper A you the number of Republicans is increasing over time, but also the frequency of the word kill is increasing over time like over like a twenty year time periodand you would measure this correlation between Republic and a kill. But it's actually just a spurious correlation that if the word kill is increasing in both parties and the number of Republican is increasing. So this's like a kind of a classic correlation virus causation issue. You're actually not estimating a partisan dimension in text yours getting this time confounding trend and this is something that you nwits not recognized very much an nip, but when you're started applying to kind of social science issues, then this is going to be everywhere. And so rather than just estimate these rather as do feature selection based on the rock correlations, you might actually want to try to deconfound some of the frequency voters or at least the outcome vector. And for example, if you deny all the word frequencies by year and then estimate the relationship to democrats and republican, then you're getting just a within year variation and excluding it this time for point. And so this is something that will will continually come in back when we're trying to get up more casual or more informative representations and correlations. This is just like some side note is that you might have. you might want to get rid of both the time component of this correlation but also the geographical component. So link to Congressmen from taxes. for example, they might. you might have the word congresor from taxes might use the word to kill up often times, right? And so in order to remove both the year and the state variation, you can use a literary reason to do that and for why it would be the outcome and x would be the frequency of each word. You redress that against less categorical variables and take the residuals and then use those and your feature selection or machine learning to bask. So that's done. Found did feature selection. We're right at at four o'clock now. so I want to wrap up. And also for those of you who were doing a project I'd like you to stay for five more minutes to talk about it. We're going to write But now and I'm going to finish these slides at the beginning of of next time. Are there any questions or logistical or otherwise before we wrap up right? Thanks and will see you next week at the same time here in the same room. If you're interested in doing a project please stick around for a few minutes. Sigiisi A but cut in be on the tide process a holeayat at a of of more is is said to always if you have to wind up where the not object is too yes please he So if you have not sign for a peak yet and your he man to hear about it feel free to is to cart but you just go to hear about it. So if it's okay I'm just going to start talking and is freed to go yesterday. There is an additional to optional course credits for a course project. It can be done individually or in groups of up to force students and it's really just doing an application or a act based on what we're the continent of the course. so it's quite broad in terms of the topic. but usually its an app project on some social or political data or legal data but we're quite go and you can choose some thing and you're interested in. So just to give you some examples of the previous year's projects, so one of the top a better startup feature Startups: the Deep Judge team. Their project or their system started as the course project in this class and so they actually built this context sensitive legal search engine which's pretty amazing and they went on to get some by funding their headquarters at the I center. Now another group did some environmental regulation analytics and they want it in a swift grant. So just to show you how successful some of these projects have been, a number of them have been published and one on doing legal language modeling, another on inspiring legal language, another on medical documents summarization of medical documents, one before biting with a old student. Here he built a basically an automated question answering system for a coding class and then no Langu who's also a old student here publishes an attractive summarization system using reinforcement learning and so those last to those are individual projects. So even if you do a project by yourself, those have been in just being successful. There's a number other projects that that I think they have good chance of being published at some point either an top conference or a social science journal. We made a partisan treat generator when standard. Stump Bombelli who's another students here. She did an analysis of immigrant attitudes and historical newspapers. One group did and deep it like normal nets and instrumental liberals. The kind of caused a little of paper. One group did parties and question answering. it does not have to be text. One of the students did an audio an analysis which was fine. if we want to do audio or other images we're not in cover that in the course material. but you're welcome to do that for your projects and then some of them have just been kind of classification projects in terms of picking a topic, You're welcoming to pick one on your own and I can provide back if I think it's a good topic or not. for how to modify it, some of you are ready. Asked about this. we have a list of suggested topics like we have a short list of topics that I think are kind of interesting to do right now and then a longer list if you want to see that. just send opera, email wearing people, less economist everybody list of the same time. With some doubts it would also be good to think of which maybe one or two or three topics you're interested in because then I can provide some thoughts about advice about what's doable for course project One you take a look at these slides from the gethub, but once you formed a group, send us a list of their team members and it be useful. You do not have to be right a Cv for this or something, but would be useful. to know what courses you' we take in and so that we can set expectations and things and then some of the projects have either new or after we advise them but there's a number of other project advisors that will help out this. So for those if you need help picking a topic we can need to disc test. maybe it could be Onzumor in person and if you already know your topic we will also need so that I help you get start. So those are the points that I wanted to to bring up for Now you do need to say you do not need to pick a top topic for a month a few weeks from now so this is none of us urgent. but I'll send the list of project ideas A we go from there, are there any questions or concerns at the moment A right to keep the such with any questions and will see you next week. Teach changes are sejetd.\",\n \"\\n\\nMUSIC ENHANCEMENT VIA IMAGE TRANSLATION AND VOCODING Nikhil Kandpal* Oriol Nieto, Zeyu Jin University of North Carolina at Chapel Hill Computer Science Department Chapel Hill, NC, USA Adobe Research San Francisco, CA, USA ABSTRACT to develop a solution that works for polyphonic signal enhancement and reflects the unique qualities of music perception. Our approach performs enhancement on the recording' s melspectrogram representation. This is achieved by treating the melspectrogram as an image and training an image-to-image translation model similar t0 Pix2Pix [3] to transform low-quality melspectrogram into that of a high-quality signal. We hypothesize that it is easier to enhance polyphonic signals in the mel-spectrogram domain as polyphonic sources are additive and have a very small temporal span compared to waveforms Finally; to map generated high-quality mel-spectrograms to perceptually realistic waveforms we train vocoding model based on DiffWave [4|. Training this model on only the high quality samples of music performance makes it robust to the artifacts that reside in the synthetic mel-spectrogram_ We evaluate Our approach by performing listening test with 211 participants, and we show that this approach achieves a much better perceptual enhancement than several state-of-the-art techniques_ We also compare the subjective listening test scores with widely used audio quality metrics and suggest that, similar to speech enhancement, these metrics correlate poorly with human perception [1[5 : With this work; we hope to motivate both future research in music enhancement as well as music quality perceptual metrics akin to those in the speech literature [6], [1. To promote further research, audio samples generated in our experiments and source code are provided at Our project website In this paper; we refer to Pix2Pix models operating on melspectrograms as MelzMel models and vocoding applied to the music domain as musecoding. We summarize our contributions as follows: A music enhancement model leveraging recent work on conditional image synthesis and vocoding: generative process for simulating realistic low-quality music recordings from professional-quality recordings An analysis of the reliability of common audio enhancement evaluation metrics in the music domain_ 8 Consumergrade music recordings such as those captured by mo bile devices typically contain distortions in the form of background noise, reverb_ and microphone-induced EQ. This paper presents 2 deep learning approach to enhance low-quality music recordings by combining (i) an image-to-image translation model for manipulating audio in its mel-spectrogram representation and (ii) a music vocod8 ing model for mapping synthetically generated mel-spectrograms to perceptually realistic waveforms We find that this approach to music enhancement outperforms baselines which use classical methods for mel-spectrogram inversion and an end-to-end approach directly mapping noisy waveforms t0 clean waveforms. Additionally, 8 in evaluating the proposed method with listening test; we analyze the reliability of common audio enhancement evaluation metrics when used in the music domain: Index Terms _ Music Enhancement; Image-to-Image Translation, Diffusion Probabilistic Models, Vocoding 7 1. INTRODUCTION With the rise of Internet influencers and music hobbyists, large portion of music content is created with cheap and accessible recording devices in non-treated environments _ While being audible, these recordings often have degraded quality stemming from background noise, unpleasant reverb, and resonance caused by the microphone and the environment_ This prompts US t0 investigate quality enhance1 ment for music signals, transforming low-quality amateur recordings into professional ones. The main difficulty of such an endeavor is that $o many aspects of the low-quality recording setup are unknown Parameters of the recording device, such as frequency response characteristics, vary drastically across different hardware. Additionally, acoustic properties such as the size, shape, and reflectivity of the recording environment vary between different recording setups_ Finally, background noise is hard to capture and generalize, especially non-stationary 2. RELATED WORK noise. A solution that faithfully transforms low-quality recording into what it would sound like recorded professionally must implicTo our knowledge there is little prior work studying music quality itly or explicitly infer all of these aspects from the signal alone In enhancement. The work most similar to our contributions focuses on speech enhancement; end-to-end methods such as HiFi-GAN [1 and speech enhancement; conditional speech synthesis, O music source Demucs |2] achieve this by extracting the speech source from a mixseparation_ ture of sources However; music signals are often polyphonic, i. eEarly approaches to speech enhancement have used classithere can be an arbitrary number of sources to be extracted at once_ cal signal processing techniques such as Wiener filtering [8] and Moreover; the perception of music quality typically differs from that non-negative matrix factorization [9|. More recently, deep learningof speech: For example, human listeners may find reverb pleasant based methods have achieved state-of-the-art on speech enhancein music, while it is usually undesired in speech. Therefore, we aim ment_ These methods either manipulate the audio in its magnitude Work done during an internship with Adobe Research Ihttps: nkandpa2. github 10 music enhancement\\n\\nHigh-Quality Spectrogram MelzMel GAN Musecoder DDPM Low-Quality Spectrogram Conv Encoder Conv Decoder Denoising Steps High-Quality Waveform Noise Diffusion Steps Fig: 1. Model architecture of our Mel2Mel + Diffwave model. First, low-quality mel-spectrogram is enhanced by conditional GAN. The resulting synthetic mel-spectrogram is then \\\"musecoded\\\" into a waveform by a Denoising Diffusion Probabilistic Model (DDPM) spectrogram representation (followed by spectrogram inversion method to recreate the corresponding waveform) |10} H[2| or map directly from the low-quality waveform to a cleaned waveform 113] 2 : Methods that operate on the time~frequency domain generally produce audible artifacts due to the use of phase reconstruction algorithms like the Griffin-Lim algorithm [14] A recent work addresses this with neural-network based vocoders [15], yet its quality is not 0 par with an end-to-end approach [16p Alternatively; methods that work on the time domain typically require more training steps |1| Conditional speech synthesis techniques produce speech wave _ forms from conditioning information such as magnitude spectrograms, problem commonly known as vocoding Some state-of-theart vocoding methods involve using generative adversarial networks [17[8, or denoising diffusion probabilistic models 4/19] for generating audio. Music source separation focuses on taking a mix of multiple music stems (vocals, drums, etc. ) and separating the mix into its individual sources Some approaches to music source separation operate by masking spectrograms |20] or directly mapping the mix waveform t0 individual source waveforms [21122/_ The music enhancement problem is different than music source separation, since our goal is not only to extract all musical sources from a noisy mixture but also to reduce reverb and adjust EQ such that the listening experience is improved dataset we assume access to high-quality recordings and define generative process for simulating low-quality ones. First, we simulate the reverb and varied microphone placements of a nonprofessional recording environment by convolving the high-quality music signal with a room impulse response. Next; we apply additive background noise scaled to achieve randomly sampled SNR between 5 and 30 dB. Finally, we simulate a low-quality microphone frequency response by applying 4-band equalization with randomly sampled gains between15 and 15 dB and frequency bands from 0-200, 200-1000, 1000-4000, and 4000-8000 Hz 3. 3. Mel-Spectrogram Enhancement with Mel2Mel Our first step in music enhancement is modeling the distribution of high-quality mel-spectrograms conditioned 0n their low-quality counterparts To estimate this distribution, we use existing work on image-to-image translation with conditional adversarial networks in an approach similar to [12|. In this framework a generator and a discriminator are trained using an aligned dataset of low and high-quality recording pairs. The generator maps from low to high-quality mel-spectrograms with the objective of maximizing the discriminator's loss and minimizing the C1 distance between the generated mel-spectrogram and the ground truth high-quality mel-spectrogram. The discriminator is trained to classify whether a given mel-spectrogram is generated or comes from the true data distribution. It performs this classification on patch-wise basis, predicting a class for each patch in the input melspectrogram_ For this reason, the discriminator acts aS a learned loss function for the generator which enforces realistic local features and the /1 loss enforces global consistency with the ground truth melspectrogram_ 3. 0 METHODS 3. 1. Modeling Approach In this paper; we investigate the approach of enhancing music in its mel-spectrogram domain, as it is easier to represent complex harmonic structures and polyphonic sound sources_ We then transform the resulting mel-spectrograms to waveforms through Diffwavebased vocoder (a process that in this context could be more aptly named \\\"musecoding\\\"). Decoupling waveform generation from melspectrogram enhancement allows us to train a musecoder that is not only robust to noise and other artifacts, but can also be used for any generation and enhancement task without the need of retraining: Figure [I depicts a block diagram of our proposed architecture: This approach is motivated by recent advances in vocoding that generate natural-sounding speech from mel-spectrograms [4]. 3. 4. Musecoding Recent work has shown that deep learning models can generate perceptually realistic waveforms from speech mel-spectrograms _ In our experiments, we evaluate the Diffwave |4] vocoder applied to music a process that we call \\\"musecoding\\\" Diffwave is a denoising diffusion probabilistic model (DDPM) This class of models defines forward diffusion process which iteratively adds gaussian noise to audio waveforms from the training dataset A model is then trained to estimate the reverse transition distributions of each noising step conditioned on the mel-spectrogram of the clean audio. Sampling from this model requires sampling noise from a standard gaussian and iteratively denoising using the reverse transition probability distributions from the model_ For further discussion of DDPMs see [4] and [23|. 3. 2 Data Simulation The modeling techniques we consider in this paper require aligned pairs of highand low-quality music recordings. To construct such\\n\\nModel Clean MOS 1 4. 39 = 0. 05 4. 06 = 0. 06 3. 01 + 0. 09 2. 85 =0. 09 4. 3. Baselines We evaluate our approach against two separate baselines First, we pair Mel2Mel for mel-spectrogram enhancement with inverse mel-scaling and the Griffin-Lim algorithm for musecoding. Both inverse mel-scaling and Griffin-Lim require solving optimization problems [291, So we run both solvers for 100 iterations, which yields per-sample runtime comparable to that of the Diffwave musecoder: Our second baseline is an end-to-end approach for music enhancement: Namely, we use the Demucs model architecture [21 and train it using the /1 reconstruction loss on our dataset of lowand high-quality recording pairs This matches the original training objective used for this architecture on the task of music source separation. We train this model for 360 epochs with batch size 64 and learning rate 0. 0003. We find that after this number of epochs the validation loss plateaus MelzMel + Diffwave Mel2Mel + Griffin-Lim No Enhancement Table 1. Mean Opinion Scores in a human listening test. As a musecoding baseline, we also consider mel-spectrogram inversion with inverse mel-scaling and the Griffin-Lim algorithm [14]. 4. EXPERIMENT SETUP 4. 1. Dataset We train and evaluate models on the Medley-solos-DB dataset [24]_ containing 21, 572 three-second, single-instrument samples recorded in professional studios. We exclude the distorted electric guitar samples to avoid fitting our models to production effects. We use 5841 samples for training, 3494 for validation and the rest for testing: We start by downsampling Our data to 16 kHz following the setup of prior vocoding work [4[71. This sample rate has shown to be favored by most speech enhancement work [D[2 and can be pOtentially super-resolved to 48 kHz with bandwidth extension techniques [5]. Using the procedure described in Section[. 2] we generate a dataset of 'highand low-quality recording pairs. For simulation of low-quality recordings, we source room impulse responses from the DNS Challenge dataset [25_ and realistic background noise from the ACE Challenge dataset |26/. As a final step, we apply a low-cut filter to remove nearly inaudible low frequencies below 35 Hz and normalize the waveforms to have a maximum absolute value of 0. 95_ We find that this treatment helps improve our models' training stability: When evaluating, we apply the same treatment (low-cut filter at 35 Hz and normalization) before applying OUr enhancement models. 4. 4. Evaluation Metrics To evaluate the results of different enhancement models we conducted Mean Opinion Score (MOS) test with human listeners on Amazon Mechanical Turk (AMT) Additionally; we evaluate enhancement methods by computing the frequency-weighted segmental SNR (fwSSNR) 1301, multi-resolution spectrogram loss (MRS) [34| C1 spectrogram distance, and Frechet Audio Distance (FAD) [32 between enhanced and clean reference signals_ In Sec tion|5. 3 we analyze the effectiveness of these objective metrics at approximating human listener ratings in the music domain. 5. 0 RESULTS 5. 1. Mean Opinion Score Test To evaluate our proposed MelzMel Diffwave music enhancement model, we conducted an MOS test with human listeners on AMT. We used 200 audio samples from our test set, added 8 different types of simulated degradation and passed these low-quality waveforms through our method, MelzMel + Griffin-Lim; and Demucs_ The lowquality, enhanced, and ground truth high-quality samples were then presented to human listeners who were asked to give a quality score from to 5. We used the ground truth high-quality recording as high anchor and the same recording with 0 dB white noise as low anchor Each Human Intelligence Task (HIT) started with screening test in which human listeners were required to identify which one of 5 audio samples sound the same aS a reference sample. 4 out of the 5 samples are passed a small amount of effects including low pass filters, high pass filters, comb filters, and added noise. Passing the screening test was required to continue The rest of the HIT consisted of 34 tests in which were validation tests to check if listeners were paying attention_ If they failed the validation test, the entire HIT was invalidated. In the end we collected 9, 095 answers from 214 listeners. The results shown in Table 1 suggests that MelzMel with a Diffwave musecoder achieves the highest MOS with a score near that of clean audio from the dataset_ 4. 2. Model Architectures and Hyperparameters In all experiments, we compute mel-spectrograms with 128 mel bins, an FFT size of 1024, and 256 sample hop length: When training models that generate O are conditioned 0n mel-spectrograms, we use log-= scale amplitudes to reduce the range of values and to avoid positive restrictions on our models domain Or range. The Mel2Mel generator described in SectionB3] consists of 2 downsampling blocks; each containing a 2D convolutional kernel of size 3 and stride 2, instance normalization |27 | and ReLU activation functions. This is followed by 3 ResNet blocks 28] with kernel size 3 and instance normalization_ Finally, the representation is upsampled back to the original dimensionality of the input with two upsampling blocks, each containing a transposed convolutional kernel of size 3 and stride 2, instance normalization, and ReLU activation functions. The Mel2Mel discriminator is a fully convolutional model made Up of three blocks, each containing a convolutional kernel of size 4 and stride 2, instance normalization, and LeakyReLU activation function. The last layer does not have any normalization O activation function Both the generator and discriminator are trained with batch size of 64 and learning rate of 0. 0002 for 200 epochs The Diffwave model described in SectionBAuses the architecture and training objective described in /4]. We train this model for 3000 epochs using a batch size of & and a learning rate of 0. 0002 5. 2. Perturbation Ablation Study To gain insight into which perturbations are handled most effectively by each enhancement model, we perform an ablation study isolating each perturbation introduced in the low-quality signal generative process Table[contains mean opinion scores for each enhancement\\n\\nModel Clean Random EQ 4. 35 = 0. 06 4. 15 =0. 07 2. 98 = 0. 1 3. 39 + 0. 10 3. 99 + 0. 08 SNR 5 SNR 10 4. 27 =0. 06 4. 24 = 0. 06 3. 53 + 0. 09 3. 07 = 0. 1 2. 71 +0. 1 SNR 15 4. 46 = 0. 06 3. 96 = 0. 09 3. 18 = 0. 11 2. 85 = 0. 11 3. 04 = 0. 12 DRR 0 DRR 3 DRR 6 4. 24 =0. 07 4. 01 = 0. 08 3. 10 = 0. 08 2. 55 = 0. 10 2. 48 + 0. 11 4. 28 = 0. 04 4. 19 = 0. 06 3. 77 = 0. 06 3. 84 = 0. 06 2. 82 + 0. 07 2. 77 _ 0. 09 3. 13 = 0. 07 3. 21 + 0. 07 4. 01 = 0. 06 3. 91 = 0. 07 4. 42 =0. 06 3. 96 = 0. 08 2. 99 + 0. 10 3. 30 + 0. 10 4. 21 +0. 07 Mel2Mel + Diffwave Mel2Mel + Griffin-Lim Demucs No Enhancement Table 2. Mean Opinion Scores in a human listening test. Each column contains the ratings for single perturbation type: EQ, additive background noise at different signal-to-noise ratios (SNR), and reverb at different direct-to-reverberant ratios (DRR)_ Enhancement Metric Rank Correlation with MOS fwSSNR 0. 5 ~MRS 056 ~LI 0. 4 ~FAD 053 Model fwSSNR 9. 04 7. 61 6. 58 8. 23 6. 96 MRS 1. 40 1. 57 1. 65 1. 80 1. 89 Ll + 1350 1, 57 1. 69 1. 83 2. 16 FAD 4. 73 4. 54 3. 98 5. 54 5. 90 Independent Training Joint Fine-tuning Joint Training Sequential Training No Enhancement Table 3. Spearman rank correlation between MOS test ratings and audio enhancement metrics. Table 4. Performance of MelzMel + Diffwave enhancement models using different training schemes model applied to signals with randomly sampled EQ, additive noise with signal-to-noise ratios (SNR) of 5, 10, and 15 dB, and reverb with direct-to-reverberant ratios (DRR) of 0, 3, and 6 dB. This ablation shows that the Mel2Mel Diffwave model excels at removing noise even at SNR values as low as 5 dB and at undoing 4-band equalization simulating a non-flat microphone frequency response. Interestingly, none of the models tested perform dereverberation very well, and in fact degrade signals that contain no noise and only simulated reverb: This may be due to train-test mismatch; since all samples enhanced during training time contained some level of additive noise. This ablation also lends insight into the types of perturbations that affect human listeners\\\"perception of music_ From the difference between the scores given to clean samples and non-enhanced samples, it is clear that additive noise impacts the listener'$ perception significantly while reverb is mostly ignored. 5. 4. Alternate Training Schemes In Section |3. 1] we motivated approaching music enhancement by training two decoupled models that separately handle melspectrogram enhancement and musecoding: Here, we investigate training schemes for these models other than independently training them on their respective tasks In addition t0 independent training, we (1) finetune the Mel2Mel generator and Diffwave musecoder jointly using the Diffwave objective, (2) train the models sequentially by first training the musecoder and then training the Mel2Mel generator with musecoder parameters frozen; and (3) train the MelzMel generator and musecoder jointly as single model using the Diffwave objective. Table shows the performance of the resulting models. In Section[3]we discussed the reliability of using these metrics for evaluating algorithms, and find that FAD is the most perceptually aligned metric when it comes to denoising Given this observation, our results suggest that joint training may yield better denoising performance than independent training: Joint training has the added benefit that only a single model is trained using a non-adversarial objective. However; this comes with the downside that the trained model cannot be split into enhancement and musecoding sub-models: Future work could focus on further exploring such training schemes. 5. 3. Perceptual Alignment of Objective Metrics The results of the MOS test also provide a mechanism t0 evaluate how well objective metrics for audio quality align with human perception in the music domain We measure fwSSNR, MRS, FAD, and 61 spectrogram distance on the same samples submitted for MOS evaluation_ We then take the mean score across all samples with given perturbation type (i. e. SNR 5, DRR 0, etc. ) and perform Spearman rank correlation with the mean scores measured in the human MOS test_ In Tablel3] we show the rank correlation for each objective metric_ We find that none of the four metrics evaluated correlate very strongly with human opinion scores, the highest achieving a rank correlation of 0. 56_ We also identify particular failure modes of these metrics AlL four metrics fail to identify robotic artifacts induced by the GriffinLim algorithm and actually rate the Mel2Mel + Griffin-Lim model as the best of all models we tested. Additionally, fwSSNR MRS, and 61 spectrogram distance all fail to identify additive noise effectively, and rate non-enhanced samples at SNR values of 10 and 15 dB as being better than any enhancement model output: FAD does not have this failure mode. 6. CONCLUSION We propose a music enhancement model that decomposes the task into mel-spectrogram enhancement and waveform synthesis from mel-spectrograms_ This model was trained using high-quality samples from a public dataset paired with low-quality samples generated by simulating artifacts that typically appear in amateur recordings_ A human MOS test shows that this model outperforms state-of-theart baselines Additionally, we found that current objective metrics for audio enhancement do not accurately reflect human perception of music. We hope this work encourages researchers to further advance the rather unexplored and yet timely topic of automatic music enhancement; either by designing more performant models 01 by proposing metrics that better align with human music perception.\\n\\n7 _ REFERENCES [17] Kundan Kumar; Rithesh Kumar; Thibault de Boissiere, Lucas Gestin, Wei Zhen Teoh, Jose Sotelo, Alexandre de Brebisson Yoshua Bengio, and Aaron Courville_ \\\"Melgan: Generative adversarial networks for conditional waveform synthesis,\\\"2019. [18] Jaeseong You_ Dalhyun Kim; Gyuhyeon Nam, Geumbyeol Hwang, and Gyeongsu Chae, \\\"Gan vocoder: Multi-resolution discriminator is all you need;\\\" 2021. [19] Nanxin Chen, Yu Zhang, Heiga Zen, Ron J. Weiss Mohammad Norouzi, and William Chan, Wavegrad: Estimating gradients for waveform generation; 2020. [20] Romain Hennequin, Anis Khlif, Felix Voituret; and Manuel Moussallam_ \\\"Spleeter: fast and efficient music source separation tool with pre-trained models Journal of Open Source Software, vol. 5, pp. 2154, 06 2020. [21] Alexandre Defossez, Nicolas Usunier; Leon Bottou, and Francis Bach; \\\"Music source separation in the waveform domain; 2021. [22] Yi Luo and Nima Mesgarani_ \\\"Conv-tasnet: Surpassing ideal time-frequency magnitude masking for speech separation;' IEEEIACM TASLP vol. 27, no. &, pp. 1256-1266, Aug 2019. [23] Jonathan Ho, Ajay Jain, and Pieter Abbeel, \\\"Denoising diffusion probabilistic models;' 2020. [24] Vincent Lostanlen and Carmine-Emanuele Cella, Deep convolutional networks on the pitch spiral for musical instrument recognition;' 2017. [25] Chandan K Reddy, Harishchandra Dubey; Kazuhito Koishida, Arun Nair; Vishak Gopal_ Ross Cutler Sebastian Braun, Hannes Gamper; Robert Aichner; and Sriram Srinivasan ~Interspeech 2021 deep noise suppression challenge;' 2021 _ [26] J. Eaton, N. D. Gaubitch, A_ H. Moore, and P A. Naylor; \\\"The ace challenge corpus description and performance evaluation,'in 2015 IEEE WASPAA, 2015, pp. 1-5_ [27] Dmitry Ulyanov, Andrea Vedaldi, and Victor Lempitsky, Instance normalization: The missing ingredient for fast stylization, ; 2017. [28] Kaiming He, Xiangyu Zhang, Shaoqing Ren; and Jian Sun_ Deep residual learning for image recognition;' in 2016 IEEE CVPR, 2016, pp. 770-778_ [29] Yao-Yuan Yang; Moto Hira, Zhaoheng Ni, Anjali Chourdia, Artyom Astafurov, Caroline Chen, Ching-Feng Yeh; Christian Puhrsch_ David Pollack; Dmitriy Genzel, Donny Greenberg, Edward Z Yang; Jason Lian, Jay Mahadeokar; Jeff Hwang; Ji Chen, Peter Goldsborough, Prabhat Roy; Sean Narenthiran, Shinji Watanabe, Soumith Chintala, Vincent Quenneville-Belair; and Yangyang Shi, Torchaudio: Building blocks for audio and speech processing; arXiv preprint arXiv:2110. 15018, 2021. [30] YHu and Philipos C. Loizou; valuation of objective quality measures for speech enhancement_ IEEE TASLP vol: 16, pp_ 229-238,. 2008. [31] Ryuichi Yamamoto, Eunwoo Song; and Jae-Min Kim; Parallel wavegan: A fast waveform generation model based on generative adversarial networks with multi-resolution spectrogram;\\\" 2020_ [32] Kevin Kilgour; Mauricio Zuluaga; Dominik Roblek; and Matthew Sharifi, Frechet audio distance: A metric for evaluating music enhancement algorithms,\\\"2019. 1] Jiaqi Su, Zeyu Jin, and Adam Finkelstein, Hifi-gan: Highfidelity denoising and dereverberation based o speech deep features in adversarial networks 2020. [2] Alexandre Defossez, Gabriel Synnaeve, and Yossi Adi, \\\"Real time speech enhancement in the waveform domain, Interspeech2020, 2020. [3] Phillip Isola, Jun-Yan Zhu; Tinghui Zhou, and Alexei A Efros, ~Image-to-image translation with conditional adversarial networks_ 2018_ [4] Zhifeng Kong; Wei Ping; Jiaji Huang, Kexin Zhao, and Bryan Catanzaro, Diffwave: versatile diffusion model for audio synthesis,'2021_ [5] Jiaqi Su, Yunyun Wang. Adam Finkelstein; and Zeyu Jin Bandwidth extension is all you need, in ICASSP 2021-2021. IEEE, 2021, pp. 696-700. [6] Pranay Manocha, Zeyu Jin, Richard Zhang; and Adam Finkelstein, \\\"Cdpam: Contrastive learning for perceptual audio similarity;\\\" in ICASSP 2021-2021. IEEE, 2021, pp. 196-200. [7] Chandan KA Reddy, Vishak Gopal, and Ross Cutler; Dnsmos: A non-intrusive perceptual objective speech quality metric to evaluate noise suppressors in ICASSP 2021-2021. IEEE, 2021, pp. 6493-6497. [8] P Scalart and J. V. Filho, \\\"Speech enhancement based on priori signal to noise estimation;\\\" in 1996 IEEE ICASSP Proceedings, 1996, vol. 2, pp. 629-632 vol: 2 [9] Hideaki Kagami, Hirokazu Kameoka; and Masahiro Yukawa, \\\"Joint separation and dereverberation of reverberant mixtures with determined multichannel non-negative matrix factorization,'in 2018 IEEE ICASSP, 2018, pp. 31-35_ [10] Kun Han, Yuxuan Wang, DeLiang Wang; William $. Woods, Ivo Merks, and Tao Zhang; Learning spectral mapping for speech dereverberation and denoising;' IEEEIACM TASLP, vol. 23, no. 6, pp. 982-992, 2015. [11] Donald S_ Williamson and DeLiang Wang; \\\"Speech dereverberation and denoising using complex ratio masks;\\\" in 2017 IEEE ICASSP, 2017, pp. 5590-5594. [12] Daniel Michelsanti and Zheng-Hua Tan; \\\"Conditional generative adversarial networks for speech enhancement and noiserobust speaker verification,\\\"Interspeech 2017, Aug 2017. [13] Santiago Pascual, Joan Serra, and Antonio Bonafonte, ~Towards generalized speech enhancement with generative adversarial networks,'2019. [14] D. Griffin and Jae Lim; ~Signal estimation from modified short-time fourier transform IEEE Transactions on Acoustics, Speech, and Signal Processing, vol_ 32, no. 2, Pp. 236 243, 1984 [15] Adam Polyak; Lior Wolf;, Yossi Adi, Ori Kabeli, and Yaniv Taigman, High fidelity speech regeneration with application to speech enhancement;' in ICASSP 2021-2021. IEEE, 2021, pp. 7143-7147 [16] Jiaqi Su, Zeyu Jin, and Adam Finkelstein; \\\"Hifi-gan-2: studioquality speech enhancement via generative adversarial networks conditioned on acoustic features;' in 2015 IEEE WASPAA, 2021_\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"summary\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 7,\n \"samples\": [\n \"The Ezurich Lecture on Machine Learning for Healthcare, presented by Julia Vogt, Valentina Boeva, and Gunnar Ratsch from the Institute for Machine Learning, focused on the use of machine learning (ML) in medical image analysis, particularly in digital pathology. It covered ML basics, including segmentation techniques like superpixels and Markov Random Fields, and image classification through Convolutional Neural Networks (CNNs). The lecture emphasized ML's role in analyzing medical images, discussing challenges like object segmentation and image denoising, and introduced methods to simplify image complexity and improve classification tasks. It also highlighted the application of pre-trained networks and the development of clinical-grade decision support systems, showcasing a method for integrating ML into clinical workflows to enhance diagnostic accuracy and reduce workload by predicting tumor probabilities and identifying lesions in whole slide images, underscoring ML's potential to advance precision medicine and clinical data analysis in healthcare.\",\n \"The instructor begins the class by offering a remote attendance option and answers questions about homework policies, emphasizing the importance of understanding the material over merely copying notes. They introduce an Edge Flow page for homework submissions and advise computer science students to consult their department regarding course credits. The discussion then shifts to course projects, their deadlines, and credit considerations. The session focuses on computational statistics, particularly on document representation and tokenization, stressing the significance of preprocessing text through methods like stemming, lemmatization, and stop word removal for social science research. The instructor explains tokenization techniques, including n-grams and hashing, and discusses feature selection and the necessity of deconfounding in text analysis. The class concludes with encouragement for students to undertake their projects, drawing inspiration from successful examples and offering guidance on topic selection and project expectations.\",\n \"Nikhil Kandpal, Oriol Nieto, Zeyu Jin, and their team developed a novel deep learning method to improve the quality of low-quality music recordings. Their technique treats the melspectrogram of a recording as an image and uses an image-to-image translation model, inspired by Pix2Pix, to upgrade it to a higher quality. This enhanced melspectrogram is then converted back to audio with a vocoding model based on DiffWave, ensuring realistic sound. The method, particularly effective for polyphonic music, outperformed existing techniques in a listening test with 211 participants, highlighting its perceptual enhancement capabilities. However, the study also points out the inadequacy of current audio quality metrics in accurately reflecting human perception of music quality, suggesting the need for better metrics. The researchers have shared their code and audio samples to encourage further exploration in music enhancement and the development of improved perceptual metrics for assessing music quality.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"summary_source\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"gpt-4-0125-preview\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 5 } ], "source": [ "from pathlib import Path\n", "import pandas as pd\n", "\n", "GAUNTLET_URL = \"https://www.dropbox.com/scl/fi/u3bjyjlb474tskbjyzmpg/gauntlet_w_ref_summaries.parquet?rlkey=qjsz6htflg77monh2y5jb3kya&dl=1\"\n", "\n", "df = pd.read_parquet(GAUNTLET_URL).convert_dtypes()\n", "df.info()\n", "df.sample(n=7)" ] }, { "cell_type": "markdown", "source": [ "let's encode the second half of each doc, to avoid giving away obvious clues like the titles etc at the beginning of a document\n" ], "metadata": { "id": "9PsJciOu7O77" } }, { "cell_type": "code", "source": [ "import re\n", "\n", "def second_half_of_text(text, split_by='word'):\n", " \"\"\"\n", " # Example usage:\n", " text_by_word = \"This is a sample string, with several words; and punctuation!\"\n", " print(second_half_of_text(text_by_word, split_by='word'))\n", "\n", " text_by_line = \"This is the first line.\\n\\nThis is the second line.\\nThis is the third line.\\nThis is the fourth line.\"\n", " print(second_half_of_text(text_by_line, split_by='line'))\n", " \"\"\"\n", " # Define regex patterns for splitting\n", " if split_by == 'line':\n", " pattern = r'\\n+'\n", " else: # Default to splitting by word\n", " pattern = r'\\s+'\n", "\n", " # Use regex to split based on the specified pattern\n", " elements = re.split(pattern, text)\n", "\n", " # Calculate midpoint and return the second half\n", " mid = len(elements) // 2\n", " if split_by == 'line':\n", " return '\\n'.join(elements[mid:])\n", " else:\n", " return ' '.join(elements[mid:])\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "6NObrRTq3sGS", "outputId": "9577665c-8d5b-455e-e997-3630085e6df1" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "with several words; and punctuation!\n", "This is the third line.\n", "This is the fourth line.\n" ] } ] }, { "cell_type": "code", "source": [ "# compute 'second half' of each doc as text\n", "\n", "df[\"text\"] = df.document_text.apply(second_half_of_text, split_by='word')\n", "df[\"text\"].head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "LbH-JKWX309S", "outputId": "9f81e60d-2976-4e8a-943b-727080e8fc27" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0 acknowledging presence is up. I do not have th...\n", "1 So when this is like the kind of classic workh...\n", "2 right, but one is looted the other, but in ord...\n", "3 somehow. It's just there, which means if we ev...\n", "4 a general principle of computation. So Like in...\n", "Name: text, dtype: object" ] }, "metadata": {}, "execution_count": 7 } ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 49, "referenced_widgets": [ "f99a45cc534f40fcb0271476fc98cfe6", "74ca7ebaa04e453ab17d8279e453e3d0", "ae26a373a71143389421b9fe362eae7c", "211e85994998449293e39794e1854572", "74aeb36848fd4e6fa8e2e5da95e047ff", "d1aeef44b6db4a1f848d862da85afae7", "367cc76a5d154016981d21b8c0fce4d4", "5d29fd08bd434a5e9eae8f752502e10c", "546e286609814b37afcd1f281daf380c", "440ecfd04e384aee9edd6090f44ed0e3", "c34234c80bcb434d9a31932320dd136f" ] }, "id": "Khl_8dGQA14R", "outputId": "84832730-939d-4d25-b8f4-075fe363f691" }, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "Batches: 0%| | 0/10 [00:00 Tensor:\n", " \"\"\"\n", " Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j.\n", "\n", " source: sentence_transformers/util.py\n", "\n", " :return: Matrix with res[i][j] = cos_sim(a[i], b[j])\n", " \"\"\"\n", " if not isinstance(a, torch.Tensor):\n", " a = torch.tensor(a)\n", "\n", " if not isinstance(b, torch.Tensor):\n", " b = torch.tensor(b)\n", "\n", " if len(a.shape) == 1:\n", " a = a.unsqueeze(0)\n", "\n", " if len(b.shape) == 1:\n", " b = b.unsqueeze(0)\n", "\n", " a_norm = torch.nn.functional.normalize(a, p=2, dim=1)\n", " b_norm = torch.nn.functional.normalize(b, p=2, dim=1)\n", " return torch.mm(a_norm, b_norm.transpose(0, 1))\n", "\n", "\n", "# Compute cosine similarity matrix efficiently\n", "cosine_scores = cos_sim(embeddings, embeddings).cpu()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "WcI6ZoZQZJ_h", "outputId": "29512761-b55f-4be4-9d47-345b5a9491bd", "cellView": "form" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "There are 171 total pairs between all docs\n", "Results for:\tBEE-spoke-data/bert-plus-L8-v1.0-syntheticSTS-4k\n", "\n", "\n", "Document Similarity Pairs (domain | filename):\n", "Document 1 Document 2 Score\n", "-------------------------------------------------------------------------------------------------------------------\n", "Top 20 most similar pairs:\n", "ASR | ASRnlp_law_lecture_week_1_v_2_c_transcr... ASR | ASRnlp_law_lecture_week_2_v_2_c_transcr... 0.9513\n", "ASR | ASRnlp_law_lecture_week_2_v_2_c_transcr... ASR | ASRnlp_law_lecture_week_3_part_1_v_2_c_... 0.9504\n", "ASR | ASRnlp_law_lecture_week_1_v_2_c_transcr... ASR | ASRnlp_law_lecture_week_3_part_1_v_2_c_... 0.9494\n", "ASR_cleaned | ASR-whisper-rpunctuated_Noam Ch... ASR_cleaned | ASR-whisper-rpunctuated_Noam Ch... 0.9297\n", "ASR | ASRnlp_law_lecture_week_3_part_1_v_2_c_... ASR_cleaned | ASR-whisper-rpunctuated_Noam Ch... 0.9219\n", "ASR | ASRnlp_law_lecture_week_1_v_2_c_transcr... ASR_cleaned | ASR-whisper-rpunctuated_Noam Ch... 0.9068\n", "ASR | ASRnlp_law_lecture_week_2_v_2_c_transcr... ASR_cleaned | ASR-whisper-rpunctuated_Noam Ch... 0.8978\n", "OCR_academic_paper | OCR_PAPER_dall-e-2-annot... OCR_academic_paper | OCR_PAPER_Hong et al. -... 0.8887\n", "ASR | ASRnlp_law_lecture_week_2_v_2_c_transcr... ASR_cleaned | ASR-whisper-rpunctuated_Noam Ch... 0.8832\n", "Script | script_strangersonatrain.txt Script | script_sunsetblvd..txt 0.8830\n", "ASR | ASRnlp_law_lecture_week_3_part_1_v_2_c_... ASR_cleaned | ASR-whisper-rpunctuated_Noam Ch... 0.8826\n", "OCR | OCR_ML4HLecture04RepresentationLearning... OCR | OCR_ML4HLecture05-NLP.pptx_.txt 0.8778\n", "OCR_academic_paper | OCR_PAPER_Hong et al. -... OCR_academic_paper | OCR_PAPER_Kandpal, Nieto... 0.8767\n", "ASR | ASRnlp_law_lecture_week_1_v_2_c_transcr... ASR_cleaned | ASR-whisper-rpunctuated_Noam Ch... 0.8714\n", "OCR | OCR_ML4HLecture02image_.txt OCR | OCR_ML4HLecture04RepresentationLearning... 0.8628\n", "OCR | OCR_ML4HLecture04RepresentationLearning... OCR_academic_paper | OCR_PAPER_Hong et al. -... 0.8525\n", "OCR | OCR_ML4HLecture02image_.txt OCR_academic_paper | OCR_PAPER_dall-e-2-annot... 0.8415\n", "OCR | OCR_ML4HLecture02image_.txt OCR | OCR_ML4HLecture05-NLP.pptx_.txt 0.8389\n", "ASR | ASRnlp_law_lecture_week_3_part_1_v_2_c_... OCR | OCR_ML4HLecture05-NLP.pptx_.txt 0.8375\n", "ASR_cleaned | ASR-whisper-rpunctuated_Noam Ch... OCR | OCR_ML4HLecture05-NLP.pptx_.txt 0.8353\n", "\n", "-------------------------------------------------------------------------------------------------------------------\n", "Bottom 20 least similar pairs:\n", "adversarial | navy seals copy pasta.txt literature | The Most Dangerous Game--Richard... 0.6168\n", "OCR_academic_paper | OCR_PAPER_Hong et al. -... Script | script_frozendisney.txt 0.6094\n", "ASR | ASRnlp_law_lecture_week_2_v_2_c_transcr... Script | script_frozendisney.txt 0.6078\n", "OCR_academic_paper | OCR_PAPER_Kandpal, Nieto... Script | script_frozendisney.txt 0.6011\n", "OCR | OCR_ML4HLecture02image_.txt Script | script_frozendisney.txt 0.6006\n", "adversarial | navy seals copy pasta.txt Script | script_frozendisney.txt 0.5957\n", "adversarial | navy seals copy pasta.txt Script | script_sunsetblvd..txt 0.5930\n", "adversarial | navy seals copy pasta.txt Script | script_findingnemo.txt 0.5706\n", "adversarial | navy seals copy pasta.txt OCR | OCR_ML4HLecture05-NLP.pptx_.txt 0.5261\n", "adversarial | navy seals copy pasta.txt OCR | OCR_ML4HLecture02image_.txt 0.5065\n", "ASR | ASRnlp_law_lecture_week_2_v_2_c_transcr... adversarial | navy seals copy pasta.txt 0.4997\n", "ASR_cleaned | ASR-whisper-rpunctuated_Noam Ch... adversarial | navy seals copy pasta.txt 0.4992\n", "ASR | ASRnlp_law_lecture_week_1_v_2_c_transcr... adversarial | navy seals copy pasta.txt 0.4951\n", "adversarial | navy seals copy pasta.txt OCR_academic_paper | OCR_PAPER_dall-e-2-annot... 0.4885\n", "academic_paper | Emie_dissertation_cleansed.t... adversarial | navy seals copy pasta.txt 0.4884\n", "adversarial | navy seals copy pasta.txt OCR_academic_paper | OCR_PAPER_Kandpal, Nieto... 0.4715\n", "ASR | ASRnlp_law_lecture_week_3_part_1_v_2_c_... adversarial | navy seals copy pasta.txt 0.4700\n", "ASR_cleaned | ASR-whisper-rpunctuated_Noam Ch... adversarial | navy seals copy pasta.txt 0.4689\n", "adversarial | navy seals copy pasta.txt OCR_academic_paper | OCR_PAPER_Hong et al. -... 0.4554\n", "adversarial | navy seals copy pasta.txt OCR | OCR_ML4HLecture04RepresentationLearning... 0.4466\n" ] } ], "source": [ "import torch\n", "import torch.nn.functional as F\n", "\n", "# @title document pairs & similarity\n", "top_n_pairs = 20 # @param {type:\"slider\", min:5, max:30, step:5}\n", "\n", "df[\"doc_labels\"] = df.apply(\n", " lambda row: row[\"source_doc_domain\"] + \" | \" + row[\"source_doc_filename\"], axis=1\n", ")\n", "docs = df.doc_labels.tolist()\n", "\n", "cosine_scores_np = cosine_scores.cpu().numpy()\n", "\n", "# Find the pairs with the highest cosine similarity scores, excluding self-comparisons\n", "pairs = []\n", "for i in range(len(cosine_scores_np) - 1):\n", " for j in range(i + 1, len(cosine_scores_np)):\n", " pairs.append({\"index\": [i, j], \"score\": cosine_scores_np[i][j]})\n", "\n", "# Sort scores in decreasing order\n", "pairs.sort(key=lambda x: x[\"score\"], reverse=True)\n", "\n", "\n", "def sp(text: str, max_chars: int = 45):\n", " return (\n", " text.strip() if len(str(text)) < max_chars else text[:max_chars].strip() + \"...\"\n", " )\n", "\n", "\n", "def print_pairs(pairs, docs, title=\"Similar Document Pairs\", top_n=10):\n", " \"\"\"\n", " Prints specified number of top and bottom document pairs and their scores with improved formatting.\n", " \"\"\"\n", " print(f\"{title}:\")\n", " print(f\"{'Document 1':<50} {'Document 2':<50} {'Score':>10}\")\n", " print(\"-\" * 115)\n", "\n", " # Print top N pairs\n", " print(f\"Top {top_n} most similar pairs:\")\n", " for pair in pairs[:top_n]:\n", " i, j = pair[\"index\"]\n", " print(f\"{sp(docs[i]):<50} {sp(docs[j]):<50} {pair['score']:10.4f}\")\n", "\n", " print(\"\\n\" + \"-\" * 115)\n", "\n", " # Print bottom N pairs\n", " print(f\"Bottom {top_n} least similar pairs:\")\n", " for pair in pairs[-top_n:]:\n", " i, j = pair[\"index\"]\n", " print(f\"{sp(docs[i]):<50} {sp(docs[j]):<50} {pair['score']:10.4f}\")\n", "\n", "\n", "# Display the top 10 and bottom 10 pairs with improved formatting\n", "print(f\"There are {len(pairs)} total pairs between all docs\")\n", "print(f\"Results for:\\t{model_name}\\n\\n\")\n", "print_pairs(\n", " pairs,\n", " docs,\n", " title=\"Document Similarity Pairs (domain | filename)\",\n", " top_n=top_n_pairs,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "vhcp0QYuZfcq", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "42dba600-8ae7-405f-c5a9-82dce8d3372f" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "SentenceTransformer(\n", " (0): Transformer({'max_seq_length': 4096, 'do_lower_case': False}) with Transformer model: BertModel \n", " (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})\n", ")" ] }, "metadata": {}, "execution_count": 12 } ], "source": [ "model" ] }, { "cell_type": "markdown", "source": [ "## bonus: summary embedding similarity\n", "\n", "\n", "> what if we do (almost) the same thing but use the summaries (generated by GPT-4)?" ], "metadata": { "id": "xleYhlIz7mLJ" } }, { "cell_type": "code", "source": [ "summaries = df.summary.to_list()\n", "summary_embeddings = model.encode(\n", " summaries,\n", " batch_size=8,\n", " show_progress_bar=True,\n", " convert_to_tensor=True,\n", " normalize_embeddings=True,\n", ")" ], "metadata": { "id": "LMsh6EotzHDN", "colab": { "base_uri": "https://localhost:8080/", "height": 49, "referenced_widgets": [ "3701db548f4e461f9d11c23910de5299", "c4b7f288aa72498db36218cb4dfcb707", "ebc6ee5d733848d3a89e8015429e3532", "71eb6c6c44f24d6ea3dbe671ec10b932", "c8d202ff50ad4496bfdf79dd06f19c8d", "cac9e967348145559899949844879c73", "1fd04299094443f0ad95ed4a90e15ea7", "2301910ad05741ff8b278eb9f875c093", "2b50c21c4aae4dedb839e28887f76c11", "faeebd3ed7e7403d807190f81d79b253", "8bb6e6924e9e4dc2afc61a02b09591b9" ] }, "outputId": "d007dd0f-da6a-4298-8234-f7c70ef42f81" }, "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "Batches: 0%| | 0/3 [00:00