 {"id":510503,"date":"2024-06-24T21:19:00","date_gmt":"2024-06-25T04:19:00","guid":{"rendered":"https:\/\/jorgep.com\/blog\/?p=510503"},"modified":"2024-10-25T14:02:26","modified_gmt":"2024-10-25T21:02:26","slug":"what-is-llm-quantization","status":"publish","type":"post","link":"https:\/\/jorgep.com\/blog\/what-is-llm-quantization\/","title":{"rendered":"What is LLM Quantization?"},"content":{"rendered":"\n<div class=\"wp-block-columns has-theme-palette-7-background-color has-background is-layout-flex wp-container-core-columns-is-layout-9d6595d7 wp-block-columns-is-layout-flex\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\">\n<p>Part of: <strong> <a href=\"https:\/\/jorgep.com\/blog\/series-ai-learnings\/\">AI Learning Series Here<\/a><\/strong><\/p>\n\n\n<style>.kadence-column395113_43ef2d-d5 > .kt-inside-inner-col,.kadence-column395113_43ef2d-d5 > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column395113_43ef2d-d5 > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column395113_43ef2d-d5 > .kt-inside-inner-col{flex-direction:column;}.kadence-column395113_43ef2d-d5 > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column395113_43ef2d-d5 > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column395113_43ef2d-d5{position:relative;}@media all and (max-width: 1024px){.kadence-column395113_43ef2d-d5 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column395113_43ef2d-d5 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column395113_43ef2d-d5\"><div class=\"kt-inside-inner-col\"><style>.wp-block-kadence-advancedheading.kt-adv-heading510545_6813a5-28, .wp-block-kadence-advancedheading.kt-adv-heading510545_6813a5-28[data-kb-block=\"kb-adv-heading510545_6813a5-28\"]{font-size:var(--global-kb-font-size-sm, 0.9rem);font-style:normal;}.wp-block-kadence-advancedheading.kt-adv-heading510545_6813a5-28 mark.kt-highlight, .wp-block-kadence-advancedheading.kt-adv-heading510545_6813a5-28[data-kb-block=\"kb-adv-heading510545_6813a5-28\"] mark.kt-highlight{font-style:normal;color:#f76a0c;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding-top:0px;padding-right:0px;padding-bottom:0px;padding-left:0px;}.wp-block-kadence-advancedheading.kt-adv-heading510545_6813a5-28 img.kb-inline-image, .wp-block-kadence-advancedheading.kt-adv-heading510545_6813a5-28[data-kb-block=\"kb-adv-heading510545_6813a5-28\"] img.kb-inline-image{width:150px;vertical-align:baseline;}<\/style>\n<p class=\"kt-adv-heading510545_6813a5-28 wp-block-kadence-advancedheading\" data-kb-block=\"kb-adv-heading510545_6813a5-28\">Quick Links:&nbsp;<a href=\"https:\/\/jorgep.com\/blog\/resources-for-learning-ai\/\">Resources for Learning AI<\/a> | <a href=\"https:\/\/jorgep.com\/blog\/keeping-up-with-ai\/\">Keep up with AI<\/a> | <a href=\"https:\/\/jorgep.com\/blog\/list-of-ai-tools\/\" data-type=\"post\" data-id=\"402818\">List of AI Tools<\/a><\/p>\n<\/div><\/div>\n<\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\"><div class=\"wp-block-template-part\"><style>.wp-block-kadence-advancedheading.kt-adv-heading395113_c650df-47, .wp-block-kadence-advancedheading.kt-adv-heading395113_c650df-47[data-kb-block=\"kb-adv-heading395113_c650df-47\"]{text-align:center;font-size:var(--global-kb-font-size-md, 1.25rem);line-height:60px;font-style:normal;background-color:#f5a511;}.wp-block-kadence-advancedheading.kt-adv-heading395113_c650df-47 mark.kt-highlight, .wp-block-kadence-advancedheading.kt-adv-heading395113_c650df-47[data-kb-block=\"kb-adv-heading395113_c650df-47\"] mark.kt-highlight{font-style:normal;color:#f76a0c;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding-top:0px;padding-right:0px;padding-bottom:0px;padding-left:0px;}.wp-block-kadence-advancedheading.kt-adv-heading395113_c650df-47 img.kb-inline-image, .wp-block-kadence-advancedheading.kt-adv-heading395113_c650df-47[data-kb-block=\"kb-adv-heading395113_c650df-47\"] img.kb-inline-image{width:150px;vertical-align:baseline;}<\/style>\n<p class=\"kt-adv-heading395113_c650df-47 wp-block-kadence-advancedheading\" data-kb-block=\"kb-adv-heading395113_c650df-47\">Subscribe to <a href=\"https:\/\/go.35s.be\/jtb\" target=\"_blank\" rel=\"noreferrer noopener\"><strong>JorgeTechBits  newsletter<\/strong><\/a><\/p>\n<\/div><\/div>\n<\/div>\n\n\n\n<div style=\"font-family: Verdana, Geneva, sans-serif; font-size: 11px; line-height: 1.6; color: #333;\">\n    <p>\n        <strong>Disclaimer:<\/strong> \n        <em>I personally love to share my learnings, thoughts, and ideas; I get great satisfaction knowing someone has read and benefited from an article. This content is created entirely on my own time and in a personal capacity. The views expressed here are mine alone and do not represent the positions or opinions of my employer.<\/em>\n    <\/p>\n    <p>\n        In my professional role, I serve as a Workforce Transformation Solutions Principal for \n        <a href=\"https:\/\/www.dell.com\/en-us\/work\/learn\/by-service-type-deployment\" style=\"color: #007db8; font-weight: bold; text-decoration: none;\">Dell Technology Services<\/a>. \n        I am passionate about guiding organizations through complex technology transitions and \n        <a href=\"https:\/\/www.delltechnologies.com\/en-us\/what-we-do\/workforce-transformation.htm\" style=\"color: #007db8; font-weight: bold; text-decoration: none;\">Workforce Transformation<\/a>. \n        <a href=\"https:\/\/www.delltechnologies.com\/en-us\/index.htm\" style=\"color: #007db8; font-weight: bold; text-decoration: none;\">Learn more at Dell Technologies<\/a>.\n    <\/p>\n    <hr style=\"border: 0; border-top: 1px solid #ddd; margin: 12px 0;\">\n<\/div>\n\n\n\n<p><\/p>\n\n\n\n<p>LLM quantization is the process of reducing the precision of numbers used to represent a model&#8217;s weights, typically converting from 32-bit floating point (FP32) to smaller formats like 8-bit integers (INT8) or 4-bit integers (INT4). Think of it like compressing a high-resolution image to a smaller file size while trying to maintain as much quality as possible.<\/p>\n\n\n\n<blockquote class=\"wp-block-quote is-layout-flow wp-block-quote-is-layout-flow\">\n<p><em>LLM Quantization is all about Making Large Language Models More Accessible!<\/em> PLease see <a href=\"https:\/\/jorgep.com\/blog\/what-is-llm-quantization\/\">Runnin LLMs on your Local Computer<\/a><\/p>\n<\/blockquote>\n\n\n\n<h2 class=\"wp-block-heading\">Why Quantize LLMs?<\/h2>\n\n\n\n<p>The primary benefits of quantization are dramatic reductions in model size, decreased memory usage, and improved inference speed. <strong>To put this in perspective: a 13-billion parameter model that would normally require 52GB of memory in full precision can run in as little as 6.5GB after quantization. <\/strong>This makes it possible to run powerful AI models on consumer-grade hardware.<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">Common Quantization Methods<\/h2>\n\n\n\n<h3 class=\"wp-block-heading\">Post-Training Quantization (PTQ)<\/h3>\n\n\n\n<p>This is the simplest approach, applied after a model is fully trained. Think of it as an after-market modification &#8211; you take an existing model and compress it. While this method is quick to implement, it may result in larger accuracy drops compared to other methods.<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">Quantization-Aware Training (QAT)<\/h3>\n\n\n\n<p>QAT builds quantization into the training process itself. Imagine teaching someone to whisper effectively instead of asking them to lower their voice after they&#8217;ve learned to speak. While this method requires full retraining, it typically preserves more of the model&#8217;s accuracy.<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">Dynamic Quantization<\/h3>\n\n\n\n<p>This method quantizes weights on-the-fly during inference while keeping activations in full precision. It&#8217;s like having a just-in-time compression system, offering more flexibility but using more memory than static approaches.<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">Precision Levels and Their Impact<\/h2>\n\n\n\n<p>Starting with FP32 (full precision), each step down in precision brings greater resource savings but potential accuracy costs:<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>FP16: Cuts size in half with minimal accuracy loss<\/li>\n\n\n\n<li>INT8: Reduces size to 1\/4 with noticeable but often acceptable impact<\/li>\n\n\n\n<li>INT4: Shrinks to 1\/8 the size with larger accuracy trade-offs<\/li>\n<\/ul>\n\n\n\n<h2 class=\"wp-block-heading\">Popular Implementation Approaches<\/h2>\n\n\n\n<h3 class=\"wp-block-heading\">GGML<\/h3>\n\n\n\n<p>Optimized specifically for CPU inference, GGML has become a favorite in the open-source community, particularly through its use in llama.cpp. It offers multiple quantization formats and is highly optimized for consumer hardware.<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">AWQ (Advanced Weight Quantization)<\/h3>\n\n\n\n<p>This newer approach focuses on preserving accuracy while achieving high compression rates. It&#8217;s particularly popular in local deployment scenarios where both size and performance matter.<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">GPTQ<\/h3>\n\n\n\n<p>Known for its group-wise quantization approach, GPTQ strikes a good balance between speed and accuracy. It&#8217;s widely adopted in the community and has proven especially effective for larger models.<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">Best Practices for Implementation<\/h2>\n\n\n\n<p>Start by matching your quantization strategy to your use case:<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>For models over 13B parameters, consider 4-bit quantization as your starting point<\/li>\n\n\n\n<li>Critical applications might require higher precision<\/li>\n\n\n\n<li>Test thoroughly on your specific use cases before deployment<\/li>\n\n\n\n<li>Monitor both performance and accuracy metrics<\/li>\n<\/ul>\n\n\n\n<p>Last note: Quantization isn&#8217;t just about making models smaller &#8211; it&#8217;s about making AI more accessible and practical for real-world applications. The key is finding the right balance between model performance and resource constraints for your specific needs.<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">See Also: <\/h2>\n\n\n\n<ul class=\"wp-block-list\">\n<li><a href=\"https:\/\/jorgep.com\/blog\/what-are-large-language-models-llm\/\">What Are Large Language Models (LLM) <\/a><\/li>\n\n\n\n<li><a href=\"https:\/\/jorgep.com\/blog\/chatgpt-is-not-an-llm\/\" data-type=\"post\" data-id=\"464237\">Chatbots are not LLMS<\/a><\/li>\n\n\n\n<li><a href=\"https:\/\/jorgep.com\/blog\/create-a-custom-ai-chatbot-using-your-own-data\/\" data-type=\"post\" data-id=\"430244\">Creating a ChatBot using YOUR data<\/a><\/li>\n\n\n\n<li><a href=\"https:\/\/jorgep.com\/blog\/copilot-studio-create-your-own-copilot-with-no-code\/\" data-type=\"post\" data-id=\"462663\">Copilot Studio \u2013 Create your own Copilot with No Code <\/a><\/li>\n<\/ul>\n\n\n\n<p><\/p>\n","protected":false},"excerpt":{"rendered":"<p>LLM quantization is the process of reducing the precision of numbers used to represent a model&#8217;s weights, typically converting from 32-bit floating point (FP32) to smaller formats like 8-bit integers (INT8) or 4-bit integers (INT4). Think of it like compressing a high-resolution image to a smaller file size while trying to maintain as much quality&#8230;<\/p>\n","protected":false},"author":2,"featured_media":427864,"comment_status":"closed","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"_kad_blocks_custom_css":"","_kad_blocks_head_custom_js":"","_kad_blocks_body_custom_js":"","_kad_blocks_footer_custom_js":"","ngg_post_thumbnail":0,"episode_type":"","audio_file":"","podmotor_file_id":"","podmotor_episode_id":"","cover_image":"","cover_image_id":"","duration":"","filesize":"","filesize_raw":"","date_recorded":"","explicit":"","block":"","itunes_episode_number":"","itunes_title":"","itunes_season_number":"","itunes_episode_type":"","_kad_post_transparent":"","_kad_post_title":"","_kad_post_layout":"","_kad_post_sidebar_id":"","_kad_post_content_style":"","_kad_post_vertical_padding":"","_kad_post_feature":"","_kad_post_feature_position":"","_kad_post_header":false,"_kad_post_footer":false,"_kad_post_classname":"","footnotes":""},"categories":[441],"tags":[471,930,842,871,876],"class_list":["post-510503","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-tech-talk","tag-ai","tag-ai-series","tag-chatgpt","tag-genai","tag-llm"],"taxonomy_info":{"category":[{"value":441,"label":"Tech Talk"}],"post_tag":[{"value":471,"label":"AI"},{"value":930,"label":"AI Series"},{"value":842,"label":"ChatGPT"},{"value":871,"label":"GenAi"},{"value":876,"label":"LLM"}]},"featured_image_src_large":["https:\/\/jorgep.com\/blog\/wp-content\/uploads\/FeaturedImage-Topic-AI-1024x512.png",1024,512,true],"author_info":{"display_name":"Jorge Pereira","author_link":"https:\/\/jorgep.com\/blog\/author\/jorge\/"},"comment_info":0,"category_info":[{"term_id":441,"name":"Tech Talk","slug":"tech-talk","term_group":0,"term_taxonomy_id":451,"taxonomy":"category","description":"","parent":0,"count":677,"filter":"raw","cat_ID":441,"category_count":677,"category_description":"","cat_name":"Tech Talk","category_nicename":"tech-talk","category_parent":0}],"tag_info":[{"term_id":471,"name":"AI","slug":"ai","term_group":0,"term_taxonomy_id":481,"taxonomy":"post_tag","description":"","parent":0,"count":146,"filter":"raw"},{"term_id":930,"name":"AI Series","slug":"ai-series","term_group":0,"term_taxonomy_id":940,"taxonomy":"post_tag","description":"","parent":0,"count":151,"filter":"raw"},{"term_id":842,"name":"ChatGPT","slug":"chatgpt","term_group":0,"term_taxonomy_id":852,"taxonomy":"post_tag","description":"","parent":0,"count":19,"filter":"raw"},{"term_id":871,"name":"GenAi","slug":"genai","term_group":0,"term_taxonomy_id":881,"taxonomy":"post_tag","description":"","parent":0,"count":83,"filter":"raw"},{"term_id":876,"name":"LLM","slug":"llm","term_group":0,"term_taxonomy_id":886,"taxonomy":"post_tag","description":"","parent":0,"count":17,"filter":"raw"}],"_links":{"self":[{"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/posts\/510503","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/users\/2"}],"replies":[{"embeddable":true,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/comments?post=510503"}],"version-history":[{"count":0,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/posts\/510503\/revisions"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/media\/427864"}],"wp:attachment":[{"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/media?parent=510503"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/categories?post=510503"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/tags?post=510503"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}