 {"id":510503,"date":"2024-06-24T21:19:00","date_gmt":"2024-06-25T04:19:00","guid":{"rendered":"https:\/\/jorgep.com\/blog\/?p=510503"},"modified":"2024-10-25T14:02:26","modified_gmt":"2024-10-25T21:02:26","slug":"what-is-llm-quantization","status":"publish","type":"post","link":"https:\/\/jorgep.com\/blog\/what-is-llm-quantization\/","title":{"rendered":"What is LLM Quantization?"},"content":{"rendered":"\n<div class=\"wp-block-columns has-theme-palette-7-background-color has-background is-layout-flex wp-container-core-columns-is-layout-2edb0647 wp-block-columns-is-layout-flex\" style=\"margin-top:0;margin-bottom:0\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:80%\">\n<p>Part of: <strong> <a href=\"https:\/\/jorgep.com\/blog\/series-ai-learnings\/\">AI Learning Series Here<\/a><\/strong><\/p>\n\n\n<style>.kadence-column395113_97b87a-23 > .kt-inside-inner-col,.kadence-column395113_97b87a-23 > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column395113_97b87a-23 > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column395113_97b87a-23 > .kt-inside-inner-col{flex-direction:column;}.kadence-column395113_97b87a-23 > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column395113_97b87a-23 > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column395113_97b87a-23{position:relative;}@media all and (max-width: 1024px){.kadence-column395113_97b87a-23 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column395113_97b87a-23 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column395113_97b87a-23\"><div class=\"kt-inside-inner-col\"><style>.wp-block-kadence-advancedheading.kt-adv-heading510545_15a085-99, 
.wp-block-kadence-advancedheading.kt-adv-heading510545_15a085-99[data-kb-block=\"kb-adv-heading510545_15a085-99\"]{font-size:var(--global-kb-font-size-sm, 0.9rem);font-style:normal;}.wp-block-kadence-advancedheading.kt-adv-heading510545_15a085-99 mark.kt-highlight, .wp-block-kadence-advancedheading.kt-adv-heading510545_15a085-99[data-kb-block=\"kb-adv-heading510545_15a085-99\"] mark.kt-highlight{font-style:normal;color:#f76a0c;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding-top:0px;padding-right:0px;padding-bottom:0px;padding-left:0px;}.wp-block-kadence-advancedheading.kt-adv-heading510545_15a085-99 img.kb-inline-image, .wp-block-kadence-advancedheading.kt-adv-heading510545_15a085-99[data-kb-block=\"kb-adv-heading510545_15a085-99\"] img.kb-inline-image{width:150px;vertical-align:baseline;}<\/style>\n<p class=\"kt-adv-heading510545_15a085-99 wp-block-kadence-advancedheading\" data-kb-block=\"kb-adv-heading510545_15a085-99\">Quick Links:\u00a0<a href=\"https:\/\/jorgep.com\/blog\/resources-for-learning-ai\/\">Resources for Learning AI<\/a> | <a href=\"https:\/\/jorgep.com\/blog\/keeping-up-with-ai\/\">Keep up with AI<\/a> | <a href=\"https:\/\/jorgep.com\/blog\/list-of-ai-tools\/\" data-type=\"post\" data-id=\"402818\">List of AI Tools<\/a> | <a href=\"https:\/\/jorgep.com\/blog\/local-ai-series\/\" data-type=\"page\" data-id=\"519365\">Local AI<\/a><\/p>\n<\/div><\/div>\n\n\n<style>.kb-row-layout-id395113_d73e95-0d > .kt-row-column-wrap{align-content:start;}:where(.kb-row-layout-id395113_d73e95-0d > .kt-row-column-wrap) > .wp-block-kadence-column{justify-content:start;}.kb-row-layout-id395113_d73e95-0d > .kt-row-column-wrap{column-gap:var(--global-kb-gap-md, 2rem);row-gap:var(--global-kb-gap-none, 0rem );padding-top:var(--global-kb-spacing-xxs, 0.5rem);padding-bottom:var(--global-kb-spacing-xxs, 0.5rem);grid-template-columns:repeat(2, minmax(0, 1fr));}.kb-row-layout-id395113_d73e95-0d > .kt-row-layout-overlay{opacity:0.30;}@media all and 
(max-width: 1024px){.kb-row-layout-id395113_d73e95-0d > .kt-row-column-wrap{grid-template-columns:repeat(2, minmax(0, 1fr));}}@media all and (max-width: 767px){.kb-row-layout-id395113_d73e95-0d > .kt-row-column-wrap{grid-template-columns:minmax(0, 1fr);}}<\/style><div class=\"kb-row-layout-wrap kb-row-layout-id395113_d73e95-0d alignnone wp-block-kadence-rowlayout\"><div class=\"kt-row-column-wrap kt-has-2-columns kt-row-layout-equal kt-tab-layout-inherit kt-mobile-layout-row kt-row-valign-top\">\n<style>.kadence-column395113_df36f9-de > .kt-inside-inner-col,.kadence-column395113_df36f9-de > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column395113_df36f9-de > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column395113_df36f9-de > .kt-inside-inner-col{flex-direction:column;}.kadence-column395113_df36f9-de > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column395113_df36f9-de > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column395113_df36f9-de{position:relative;}@media all and (max-width: 1024px){.kadence-column395113_df36f9-de > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column395113_df36f9-de > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column395113_df36f9-de\"><div class=\"kt-inside-inner-col\"><style>.wp-block-kadence-advancedheading.kt-adv-heading395113_b3212c-b9, .wp-block-kadence-advancedheading.kt-adv-heading395113_b3212c-b9[data-kb-block=\"kb-adv-heading395113_b3212c-b9\"]{text-align:center;font-size:var(--global-kb-font-size-sm, 0.9rem);line-height:60px;font-style:normal;background-color:#f5a511;}.wp-block-kadence-advancedheading.kt-adv-heading395113_b3212c-b9 mark.kt-highlight, 
.wp-block-kadence-advancedheading.kt-adv-heading395113_b3212c-b9[data-kb-block=\"kb-adv-heading395113_b3212c-b9\"] mark.kt-highlight{font-style:normal;color:#f76a0c;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding-top:0px;padding-right:0px;padding-bottom:0px;padding-left:0px;}.wp-block-kadence-advancedheading.kt-adv-heading395113_b3212c-b9 img.kb-inline-image, .wp-block-kadence-advancedheading.kt-adv-heading395113_b3212c-b9[data-kb-block=\"kb-adv-heading395113_b3212c-b9\"] img.kb-inline-image{width:150px;vertical-align:baseline;}<\/style>\n<p class=\"kt-adv-heading395113_b3212c-b9 wp-block-kadence-advancedheading\" data-kb-block=\"kb-adv-heading395113_b3212c-b9\">Subscribe to <a href=\"https:\/\/go.35s.be\/jtb\" target=\"_blank\" rel=\"noreferrer noopener\"><strong>JorgeTechBits  newsletter<\/strong><\/a><\/p>\n<\/div><\/div>\n\n\n<style>.kadence-column395113_4b4b81-29 > .kt-inside-inner-col,.kadence-column395113_4b4b81-29 > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column395113_4b4b81-29 > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column395113_4b4b81-29 > .kt-inside-inner-col{flex-direction:column;}.kadence-column395113_4b4b81-29 > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column395113_4b4b81-29 > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column395113_4b4b81-29{position:relative;}@media all and (max-width: 1024px){.kadence-column395113_4b4b81-29 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column395113_4b4b81-29 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column395113_4b4b81-29\"><div class=\"kt-inside-inner-col\"><\/div><\/div>\n\n<\/div><\/div><\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow 
wp-block-column-is-layout-flow\"><div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><a href=\"https:\/\/jorgep.com\/blog\/book-dont-just-chat-delegate\/\"><img loading=\"lazy\" decoding=\"async\" width=\"640\" height=\"1024\" src=\"https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-640x1024.jpg\" alt=\"\" class=\"wp-image-520234\" style=\"aspect-ratio:0.6250142320391666;width:98px;height:auto\" srcset=\"https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-640x1024.jpg 640w, https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-188x300.jpg 188w, https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-768x1229.jpg 768w, https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-960x1536.jpg 960w, https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-1280x2048.jpg 1280w, https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01.jpg 1600w\" sizes=\"auto, (max-width: 640px) 100vw, 640px\" \/><\/a><figcaption class=\"wp-element-caption\"><a href=\"https:\/\/jorgep.com\/blog\/book-series-ai-dont-just-chat\/\" data-type=\"page\" data-id=\"520242\">Check out the Book Series<\/a><\/figcaption><\/figure>\n<\/div><\/div>\n<\/div>\n\n\n<style>.wp-block-kadence-advancedheading.kt-adv-heading407818_afcbba-c7, .wp-block-kadence-advancedheading.kt-adv-heading407818_afcbba-c7[data-kb-block=\"kb-adv-heading407818_afcbba-c7\"]{font-size:var(--global-kb-font-size-sm, 0.9rem);font-style:normal;}.wp-block-kadence-advancedheading.kt-adv-heading407818_afcbba-c7 mark.kt-highlight, .wp-block-kadence-advancedheading.kt-adv-heading407818_afcbba-c7[data-kb-block=\"kb-adv-heading407818_afcbba-c7\"] mark.kt-highlight{font-style:normal;color:#f76a0c;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding-top:0px;padding-right:0px;padding-bottom:0px;padding-left:0px;}.wp-block-kadence-advancedheading.kt-adv-heading407818_afcbba-c7 img.kb-inline-image, 
.wp-block-kadence-advancedheading.kt-adv-heading407818_afcbba-c7[data-kb-block=\"kb-adv-heading407818_afcbba-c7\"] img.kb-inline-image{width:150px;vertical-align:baseline;}<\/style>\n<p class=\"kt-adv-heading407818_afcbba-c7 wp-block-kadence-advancedheading\" data-kb-block=\"kb-adv-heading407818_afcbba-c7\">Have questions, ideas to share, or just want to connect? I\u2019d love to hear from you! Check out my <a href=\"https:\/\/jorgep.com\/blog\/about\/\">About Page<\/a> to learn more about me or connect with me.<\/p>\n\n\n\n<p><\/p>\n\n\n\n<p>LLM quantization is the process of reducing the precision of numbers used to represent a model&#8217;s weights, typically converting from 32-bit floating point (FP32) to smaller formats like 8-bit integers (INT8) or 4-bit integers (INT4). Think of it like compressing a high-resolution image to a smaller file size while trying to maintain as much quality as possible.<\/p>\n\n\n\n<blockquote class=\"wp-block-quote is-layout-flow wp-block-quote-is-layout-flow\">\n<p><em>LLM Quantization is all about Making Large Language Models More Accessible!<\/em> Please see <a href=\"https:\/\/jorgep.com\/blog\/what-is-llm-quantization\/\">Running LLMs on your Local Computer<\/a><\/p>\n<\/blockquote>\n\n\n\n<h2 class=\"wp-block-heading\">Why Quantize LLMs?<\/h2>\n\n\n\n<p>The primary benefits of quantization are dramatic reductions in model size, decreased memory usage, and improved inference speed. <strong>To put this in perspective: a 13-billion parameter model that would normally require 52GB of memory in full precision can run in as little as 6.5GB after quantization. <\/strong>This makes it possible to run powerful AI models on consumer-grade hardware.<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">Common Quantization Methods<\/h2>\n\n\n\n<h3 class=\"wp-block-heading\">Post-Training Quantization (PTQ)<\/h3>\n\n\n\n<p>This is the simplest approach, applied after a model is fully trained. 
Think of it as an after-market modification &#8211; you take an existing model and compress it. While this method is quick to implement, it may result in larger accuracy drops compared to other methods.<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">Quantization-Aware Training (QAT)<\/h3>\n\n\n\n<p>QAT builds quantization into the training process itself. Imagine teaching someone to whisper effectively instead of asking them to lower their voice after they&#8217;ve learned to speak. While this method requires full retraining, it typically preserves more of the model&#8217;s accuracy.<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">Dynamic Quantization<\/h3>\n\n\n\n<p>This method quantizes weights on-the-fly during inference while keeping activations in full precision. It&#8217;s like having a just-in-time compression system, offering more flexibility but using more memory than static approaches.<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">Precision Levels and Their Impact<\/h2>\n\n\n\n<p>Starting with FP32 (full precision), each step down in precision brings greater resource savings but potential accuracy costs:<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>FP16: Cuts size in half with minimal accuracy loss<\/li>\n\n\n\n<li>INT8: Reduces size to 1\/4 with noticeable but often acceptable impact<\/li>\n\n\n\n<li>INT4: Shrinks to 1\/8 the size with larger accuracy trade-offs<\/li>\n<\/ul>\n\n\n\n<h2 class=\"wp-block-heading\">Popular Implementation Approaches<\/h2>\n\n\n\n<h3 class=\"wp-block-heading\">GGML<\/h3>\n\n\n\n<p>Optimized specifically for CPU inference, GGML has become a favorite in the open-source community, particularly through its use in llama.cpp. It offers multiple quantization formats and is highly optimized for consumer hardware.<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">AWQ (Activation-aware Weight Quantization)<\/h3>\n\n\n\n<p>This newer approach focuses on preserving accuracy while achieving high compression rates. 
It&#8217;s particularly popular in local deployment scenarios where both size and performance matter.<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">GPTQ<\/h3>\n\n\n\n<p>Known for its group-wise quantization approach, GPTQ strikes a good balance between speed and accuracy. It&#8217;s widely adopted in the community and has proven especially effective for larger models.<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">Best Practices for Implementation<\/h2>\n\n\n\n<p>Start by matching your quantization strategy to your use case:<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>For models over 13B parameters, consider 4-bit quantization as your starting point<\/li>\n\n\n\n<li>Critical applications might require higher precision<\/li>\n\n\n\n<li>Test thoroughly on your specific use cases before deployment<\/li>\n\n\n\n<li>Monitor both performance and accuracy metrics<\/li>\n<\/ul>\n\n\n\n<p>Last note: Quantization isn&#8217;t just about making models smaller &#8211; it&#8217;s about making AI more accessible and practical for real-world applications. 
The key is finding the right balance between model performance and resource constraints for your specific needs.<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">See Also: <\/h2>\n\n\n\n<ul class=\"wp-block-list\">\n<li><a href=\"https:\/\/jorgep.com\/blog\/what-are-large-language-models-llm\/\">What Are Large Language Models (LLM) <\/a><\/li>\n\n\n\n<li><a href=\"https:\/\/jorgep.com\/blog\/chatgpt-is-not-an-llm\/\" data-type=\"post\" data-id=\"464237\">Chatbots are not LLMs<\/a><\/li>\n\n\n\n<li><a href=\"https:\/\/jorgep.com\/blog\/create-a-custom-ai-chatbot-using-your-own-data\/\" data-type=\"post\" data-id=\"430244\">Creating a ChatBot using YOUR data<\/a><\/li>\n\n\n\n<li><a href=\"https:\/\/jorgep.com\/blog\/copilot-studio-create-your-own-copilot-with-no-code\/\" data-type=\"post\" data-id=\"462663\">Copilot Studio \u2013 Create your own Copilot with No Code <\/a><\/li>\n<\/ul>\n\n\n\n<p><\/p>\n","protected":false},"excerpt":{"rendered":"<p>LLM quantization is the process of reducing the precision of numbers used to represent a model&#8217;s weights, typically converting from 32-bit floating point (FP32) to smaller formats like 8-bit integers (INT8) or 4-bit integers (INT4). 
Think of it like compressing a high-resolution image to a smaller file size while trying to maintain as much quality&#8230;<\/p>\n","protected":false},"author":2,"featured_media":427864,"comment_status":"closed","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"_kad_blocks_custom_css":"","_kad_blocks_head_custom_js":"","_kad_blocks_body_custom_js":"","_kad_blocks_footer_custom_js":"","ngg_post_thumbnail":0,"_kad_post_transparent":"","_kad_post_title":"","_kad_post_layout":"","_kad_post_sidebar_id":"","_kad_post_content_style":"","_kad_post_vertical_padding":"","_kad_post_feature":"","_kad_post_feature_position":"","_kad_post_header":false,"_kad_post_footer":false,"_kad_post_classname":"","footnotes":""},"categories":[441],"tags":[471,930,842,871,876],"class_list":["post-510503","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-tech-talk","tag-ai","tag-ai-series","tag-chatgpt","tag-genai","tag-llm"],"taxonomy_info":{"category":[{"value":441,"label":"Tech Talk"}],"post_tag":[{"value":471,"label":"AI"},{"value":930,"label":"AI Series"},{"value":842,"label":"ChatGPT"},{"value":871,"label":"GenAi"},{"value":876,"label":"LLM"}]},"featured_image_src_large":["https:\/\/jorgep.com\/blog\/wp-content\/uploads\/FeaturedImage-Topic-AI-1024x512.png",1024,512,true],"author_info":{"display_name":"Jorge Pereira","author_link":"https:\/\/jorgep.com\/blog\/author\/jorge\/"},"comment_info":0,"category_info":[{"term_id":441,"name":"Tech Talk","slug":"tech-talk","term_group":0,"term_taxonomy_id":451,"taxonomy":"category","description":"","parent":0,"count":697,"filter":"raw","cat_ID":441,"category_count":697,"category_description":"","cat_name":"Tech Talk","category_nicename":"tech-talk","category_parent":0}],"tag_info":[{"term_id":471,"name":"AI","slug":"ai","term_group":0,"term_taxonomy_id":481,"taxonomy":"post_tag","description":"","parent":0,"count":159,"filter":"raw"},{"term_id":930,"name":"AI 
Series","slug":"ai-series","term_group":0,"term_taxonomy_id":940,"taxonomy":"post_tag","description":"","parent":0,"count":165,"filter":"raw"},{"term_id":842,"name":"ChatGPT","slug":"chatgpt","term_group":0,"term_taxonomy_id":852,"taxonomy":"post_tag","description":"","parent":0,"count":19,"filter":"raw"},{"term_id":871,"name":"GenAi","slug":"genai","term_group":0,"term_taxonomy_id":881,"taxonomy":"post_tag","description":"","parent":0,"count":86,"filter":"raw"},{"term_id":876,"name":"LLM","slug":"llm","term_group":0,"term_taxonomy_id":886,"taxonomy":"post_tag","description":"","parent":0,"count":19,"filter":"raw"}],"_links":{"self":[{"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/posts\/510503","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/users\/2"}],"replies":[{"embeddable":true,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/comments?post=510503"}],"version-history":[{"count":0,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/posts\/510503\/revisions"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/media\/427864"}],"wp:attachment":[{"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/media?parent=510503"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/categories?post=510503"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/tags?post=510503"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}