 {"id":520609,"date":"2026-05-06T22:15:18","date_gmt":"2026-05-07T05:15:18","guid":{"rendered":"https:\/\/jorgep.com\/blog\/?p=520609"},"modified":"2026-05-07T19:15:11","modified_gmt":"2026-05-08T02:15:11","slug":"small-local-models-why-tiny-ai-is-having-a-big-moment","status":"publish","type":"post","link":"https:\/\/jorgep.com\/blog\/small-local-models-why-tiny-ai-is-having-a-big-moment\/","title":{"rendered":"Small Local Models: Why Tiny AI Is Having a Big Moment"},"content":{"rendered":"\n<div class=\"wp-block-columns has-theme-palette-7-background-color has-background is-layout-flex wp-container-core-columns-is-layout-2edb0647 wp-block-columns-is-layout-flex\" style=\"margin-top:0;margin-bottom:0\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:80%\">\n<p>Part of: <strong> <a href=\"https:\/\/jorgep.com\/blog\/series-ai-learnings\/\">AI Learning Series Here<\/a><\/strong><\/p>\n\n\n<style>.kadence-column395113_97b87a-23 > .kt-inside-inner-col,.kadence-column395113_97b87a-23 > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column395113_97b87a-23 > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column395113_97b87a-23 > .kt-inside-inner-col{flex-direction:column;}.kadence-column395113_97b87a-23 > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column395113_97b87a-23 > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column395113_97b87a-23{position:relative;}@media all and (max-width: 1024px){.kadence-column395113_97b87a-23 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column395113_97b87a-23 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column395113_97b87a-23\"><div 
class=\"kt-inside-inner-col\"><style>.wp-block-kadence-advancedheading.kt-adv-heading510545_15a085-99, .wp-block-kadence-advancedheading.kt-adv-heading510545_15a085-99[data-kb-block=\"kb-adv-heading510545_15a085-99\"]{font-size:var(--global-kb-font-size-sm, 0.9rem);font-style:normal;}.wp-block-kadence-advancedheading.kt-adv-heading510545_15a085-99 mark.kt-highlight, .wp-block-kadence-advancedheading.kt-adv-heading510545_15a085-99[data-kb-block=\"kb-adv-heading510545_15a085-99\"] mark.kt-highlight{font-style:normal;color:#f76a0c;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding-top:0px;padding-right:0px;padding-bottom:0px;padding-left:0px;}.wp-block-kadence-advancedheading.kt-adv-heading510545_15a085-99 img.kb-inline-image, .wp-block-kadence-advancedheading.kt-adv-heading510545_15a085-99[data-kb-block=\"kb-adv-heading510545_15a085-99\"] img.kb-inline-image{width:150px;vertical-align:baseline;}<\/style>\n<p class=\"kt-adv-heading510545_15a085-99 wp-block-kadence-advancedheading\" data-kb-block=\"kb-adv-heading510545_15a085-99\">Quick Links:\u00a0<a href=\"https:\/\/jorgep.com\/blog\/resources-for-learning-ai\/\">Resources for Learning AI<\/a> | <a href=\"https:\/\/jorgep.com\/blog\/keeping-up-with-ai\/\">Keep up with AI<\/a> | <a href=\"https:\/\/jorgep.com\/blog\/list-of-ai-tools\/\" data-type=\"post\" data-id=\"402818\">List of AI Tools<\/a> | <a href=\"https:\/\/jorgep.com\/blog\/local-ai-series\/\" data-type=\"page\" data-id=\"519365\">Local AI<\/a><\/p>\n<\/div><\/div>\n\n\n<style>.kb-row-layout-id395113_d73e95-0d > .kt-row-column-wrap{align-content:start;}:where(.kb-row-layout-id395113_d73e95-0d > .kt-row-column-wrap) > .wp-block-kadence-column{justify-content:start;}.kb-row-layout-id395113_d73e95-0d > .kt-row-column-wrap{column-gap:var(--global-kb-gap-md, 2rem);row-gap:var(--global-kb-gap-none, 0rem );padding-top:var(--global-kb-spacing-xxs, 0.5rem);padding-bottom:var(--global-kb-spacing-xxs, 0.5rem);grid-template-columns:repeat(2, 
minmax(0, 1fr));}.kb-row-layout-id395113_d73e95-0d > .kt-row-layout-overlay{opacity:0.30;}@media all and (max-width: 1024px){.kb-row-layout-id395113_d73e95-0d > .kt-row-column-wrap{grid-template-columns:repeat(2, minmax(0, 1fr));}}@media all and (max-width: 767px){.kb-row-layout-id395113_d73e95-0d > .kt-row-column-wrap{grid-template-columns:minmax(0, 1fr);}}<\/style><div class=\"kb-row-layout-wrap kb-row-layout-id395113_d73e95-0d alignnone wp-block-kadence-rowlayout\"><div class=\"kt-row-column-wrap kt-has-2-columns kt-row-layout-equal kt-tab-layout-inherit kt-mobile-layout-row kt-row-valign-top\">\n<style>.kadence-column395113_df36f9-de > .kt-inside-inner-col,.kadence-column395113_df36f9-de > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column395113_df36f9-de > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column395113_df36f9-de > .kt-inside-inner-col{flex-direction:column;}.kadence-column395113_df36f9-de > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column395113_df36f9-de > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column395113_df36f9-de{position:relative;}@media all and (max-width: 1024px){.kadence-column395113_df36f9-de > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column395113_df36f9-de > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column395113_df36f9-de\"><div class=\"kt-inside-inner-col\"><style>.wp-block-kadence-advancedheading.kt-adv-heading395113_b3212c-b9, .wp-block-kadence-advancedheading.kt-adv-heading395113_b3212c-b9[data-kb-block=\"kb-adv-heading395113_b3212c-b9\"]{text-align:center;font-size:var(--global-kb-font-size-sm, 0.9rem);line-height:60px;font-style:normal;background-color:#f5a511;}.wp-block-kadence-advancedheading.kt-adv-heading395113_b3212c-b9 
mark.kt-highlight, .wp-block-kadence-advancedheading.kt-adv-heading395113_b3212c-b9[data-kb-block=\"kb-adv-heading395113_b3212c-b9\"] mark.kt-highlight{font-style:normal;color:#f76a0c;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding-top:0px;padding-right:0px;padding-bottom:0px;padding-left:0px;}.wp-block-kadence-advancedheading.kt-adv-heading395113_b3212c-b9 img.kb-inline-image, .wp-block-kadence-advancedheading.kt-adv-heading395113_b3212c-b9[data-kb-block=\"kb-adv-heading395113_b3212c-b9\"] img.kb-inline-image{width:150px;vertical-align:baseline;}<\/style>\n<p class=\"kt-adv-heading395113_b3212c-b9 wp-block-kadence-advancedheading\" data-kb-block=\"kb-adv-heading395113_b3212c-b9\">Subscribe to <a href=\"https:\/\/go.35s.be\/jtb\" target=\"_blank\" rel=\"noreferrer noopener\"><strong>JorgeTechBits  newsletter<\/strong><\/a><\/p>\n<\/div><\/div>\n\n\n<style>.kadence-column395113_4b4b81-29 > .kt-inside-inner-col,.kadence-column395113_4b4b81-29 > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column395113_4b4b81-29 > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column395113_4b4b81-29 > .kt-inside-inner-col{flex-direction:column;}.kadence-column395113_4b4b81-29 > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column395113_4b4b81-29 > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column395113_4b4b81-29{position:relative;}@media all and (max-width: 1024px){.kadence-column395113_4b4b81-29 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column395113_4b4b81-29 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column395113_4b4b81-29\"><div class=\"kt-inside-inner-col\"><\/div><\/div>\n\n<\/div><\/div><\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow 
wp-block-column-is-layout-flow\"><div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><a href=\"https:\/\/jorgep.com\/blog\/book-dont-just-chat-delegate\/\"><img loading=\"lazy\" decoding=\"async\" width=\"640\" height=\"1024\" src=\"https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-640x1024.jpg\" alt=\"\" class=\"wp-image-520234\" style=\"aspect-ratio:0.6250142320391666;width:98px;height:auto\" srcset=\"https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-640x1024.jpg 640w, https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-188x300.jpg 188w, https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-768x1229.jpg 768w, https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-960x1536.jpg 960w, https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-1280x2048.jpg 1280w, https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01.jpg 1600w\" sizes=\"auto, (max-width: 640px) 100vw, 640px\" \/><\/a><figcaption class=\"wp-element-caption\"><a href=\"https:\/\/jorgep.com\/blog\/book-series-ai-dont-just-chat\/\" data-type=\"page\" data-id=\"520242\">Check out the Book Series<\/a><\/figcaption><\/figure>\n<\/div><\/div>\n<\/div>\n\n\n<style>.wp-block-kadence-advancedheading.kt-adv-heading519190_b33a00-c9, .wp-block-kadence-advancedheading.kt-adv-heading519190_b33a00-c9[data-kb-block=\"kb-adv-heading519190_b33a00-c9\"]{font-size:var(--global-kb-font-size-sm, 0.9rem);font-style:normal;}.wp-block-kadence-advancedheading.kt-adv-heading519190_b33a00-c9 mark.kt-highlight, .wp-block-kadence-advancedheading.kt-adv-heading519190_b33a00-c9[data-kb-block=\"kb-adv-heading519190_b33a00-c9\"] mark.kt-highlight{font-style:normal;color:#f76a0c;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding-top:0px;padding-right:0px;padding-bottom:0px;padding-left:0px;}.wp-block-kadence-advancedheading.kt-adv-heading519190_b33a00-c9 img.kb-inline-image, 
.wp-block-kadence-advancedheading.kt-adv-heading519190_b33a00-c9[data-kb-block=\"kb-adv-heading519190_b33a00-c9\"] img.kb-inline-image{width:150px;vertical-align:baseline;}<\/style>\n<p class=\"kt-adv-heading519190_b33a00-c9 wp-block-kadence-advancedheading\" data-kb-block=\"kb-adv-heading519190_b33a00-c9\"><strong>Disclaimer:<\/strong> <strong>I create this content entirely on my own time, and the views expressed here are mine alone (not my employer&#8217;s)<\/strong>. Because I love leveraging new tech, I use AI tools like Gemini, NotebookLM, Claude, Perplexity and others as a &#8220;digital team&#8221; to help research and polish these articles so I can share the best possible insights with you!<\/p>\n\n\n<style>.wp-block-kadence-advancedheading.kt-adv-heading407818_afcbba-c7, .wp-block-kadence-advancedheading.kt-adv-heading407818_afcbba-c7[data-kb-block=\"kb-adv-heading407818_afcbba-c7\"]{font-size:var(--global-kb-font-size-sm, 0.9rem);font-style:normal;}.wp-block-kadence-advancedheading.kt-adv-heading407818_afcbba-c7 mark.kt-highlight, .wp-block-kadence-advancedheading.kt-adv-heading407818_afcbba-c7[data-kb-block=\"kb-adv-heading407818_afcbba-c7\"] mark.kt-highlight{font-style:normal;color:#f76a0c;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding-top:0px;padding-right:0px;padding-bottom:0px;padding-left:0px;}.wp-block-kadence-advancedheading.kt-adv-heading407818_afcbba-c7 img.kb-inline-image, .wp-block-kadence-advancedheading.kt-adv-heading407818_afcbba-c7[data-kb-block=\"kb-adv-heading407818_afcbba-c7\"] img.kb-inline-image{width:150px;vertical-align:baseline;}<\/style>\n<p class=\"kt-adv-heading407818_afcbba-c7 wp-block-kadence-advancedheading\" data-kb-block=\"kb-adv-heading407818_afcbba-c7\">Have questions, ideas to share, or just want to connect? I\u2019d love to hear from you! 
Check out my <a href=\"https:\/\/jorgep.com\/blog\/about\/\">About Page<\/a> to learn more about me or connect with me.<\/p>\n\n\n\n<p>The narrative around artificial intelligence has long been dominated by bigger equals better. Models with trillions of parameters, trained on internet-scale datasets, powered by massive GPU clusters\u2014these were the benchmarks of progress. But something interesting is happening at the other end of the spectrum. Small models\u2014measured in billions, not trillions of parameters\u2014are proving they can deliver remarkably capable intelligence at a fraction of the computational cost.<a rel=\"noreferrer noopener\" target=\"_blank\" href=\"https:\/\/dev.to\/arshtechpro\/gemma-4-a-practical-guide-for-developers-2co5\"><\/a><\/p>\n\n\n\n<p>This shift matters for practical reasons. Not every task requires the reasoning depth of a frontier model. Not every developer has access to enterprise GPU clusters. Not every use case can tolerate cloud latency or API costs that scale with usage. Small local models run on consumer hardware, work offline, keep data private, and cost pennies to operate.<\/p>\n\n\n\n<p><a href=\"https:\/\/www.latent.space\/p\/ainews-top-local-models-list-april\" target=\"_blank\" rel=\"noreferrer noopener\"><\/a><\/p>\n\n\n\n<p>This guide examines four leading small models available today: Google&#8217;s Gemma 4, NVIDIA&#8217;s Nemotron Nano 3 Omni, Alibaba&#8217;s Qwen 3.5B, and Mistral Small. Each represents a different philosophy about what small can achieve\u2014and where the trade-offs lie.<\/p>\n\n\n\n<p><\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"why-small-models-matter-now\">Why Small Models Matter Now<\/h2>\n\n\n\n<p><\/p>\n\n\n\n<p>Before diving into the models themselves, it is worth understanding why small local models have moved from experimental curiosity to viable production option.<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>The economics are undeniable. 
<\/strong>Small local models, amortized across hardware purchases, cost fractions of a cent per million tokens. For high-volume applications\u2014customer service automation, document processing, code completion\u2014the savings compound quickly.<a href=\"https:\/\/www.latent.space\/p\/ainews-top-local-models-list-april\" target=\"_blank\" rel=\"noreferrer noopener\"><\/a><\/li>\n\n\n\n<li><strong>Privacy is another driver.<\/strong> When models run locally, data never leaves the machine. This matters for healthcare, legal, financial services, and any regulated industry. It also matters for developers who simply do not want to ship proprietary code or personal documents to third-party APIs.<a href=\"https:\/\/www.perplexity.ai\/search\/9598a2cb-43a7-4d01-9cbd-7bed9b7150bf\" target=\"_blank\" rel=\"noreferrer noopener\"><\/a><\/li>\n\n\n\n<li><strong>Latency matters too.<\/strong> A local model responding in 50 milliseconds beats a cloud model at 500 milliseconds, regardless of raw capability. For interactive applications like voice assistants, real-time coding companions, or embedded devices, local inference is the only practical option.<\/li>\n\n\n\n<li><strong>Finally, there is resilience.<\/strong> API services go down. Rate limits kick in. Model versions change behavior without warning. Local models offer stability and control that cloud APIs cannot match.<\/li>\n<\/ul>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"google-gemma-4-the-practical-workhorse\">Google Gemma 4: The Practical Workhorse<\/h2>\n\n\n\n<p>Google&#8217;s Gemma 4 family represents a refinement of what the company learned building Gemini at scale, packaged for developers who need something deployable on ordinary hardware. 
The family comes in sizes from 2B to 27B+ parameters (April 2026 release under Apache 2.0).<\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"technical-specifications\">Technical Specifications<\/h2>\n\n\n\n<figure class=\"wp-block-table\"><table><thead><tr><th class=\"has-text-align-left\" data-align=\"left\">Variant<\/th><th class=\"has-text-align-left\" data-align=\"left\">Parameters<\/th><th class=\"has-text-align-left\" data-align=\"left\">Context Window<\/th><th class=\"has-text-align-left\" data-align=\"left\">Release Date<\/th><\/tr><\/thead><tbody><tr><td>Gemma 4 2B<\/td><td>2 billion<\/td><td>128K<\/td><td>April 2026&nbsp;<a rel=\"noreferrer noopener\" target=\"_blank\" href=\"https:\/\/dev.to\/arshtechpro\/gemma-4-a-practical-guide-for-developers-2co5\"><\/a><\/td><\/tr><tr><td>Gemma 4 4B<\/td><td>4 billion<\/td><td>128K<\/td><td>April 2026&nbsp;<a rel=\"noreferrer noopener\" target=\"_blank\" href=\"https:\/\/dev.to\/arshtechpro\/gemma-4-a-practical-guide-for-developers-2co5\"><\/a><\/td><\/tr><tr><td>Gemma 4 27B<\/td><td>27 billion<\/td><td>128K<\/td><td>April 2026&nbsp;<a rel=\"noreferrer noopener\" target=\"_blank\" href=\"https:\/\/ai.google.dev\/gemma\/docs\/core\"><\/a><\/td><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<p>The 128,000-token context window is noteworthy for a small model family. 
This matches top competitors, making Gemma suitable for document analysis, long-form summarization, and retrieval-augmented generation where context matters.<a rel=\"noreferrer noopener\" target=\"_blank\" href=\"https:\/\/ai.google.dev\/gemma\/docs\/core\"><\/a><\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"hardware-requirements\">Hardware Requirements<\/h2>\n\n\n\n<figure class=\"wp-block-table\"><table><thead><tr><th class=\"has-text-align-left\" data-align=\"left\">Precision<\/th><th class=\"has-text-align-left\" data-align=\"left\">4B VRAM<\/th><th class=\"has-text-align-left\" data-align=\"left\">12B VRAM<\/th><th class=\"has-text-align-left\" data-align=\"left\">27B VRAM<\/th><\/tr><\/thead><tbody><tr><td>4-bit (Q4_K_M)<\/td><td>3-4 GB<\/td><td>8-9 GB<\/td><td>16-18 GB<\/td><\/tr><tr><td>8-bit (Q8_0)<\/td><td>5-6 GB<\/td><td>14-16 GB<\/td><td>28-32 GB<\/td><\/tr><tr><td>FP16<\/td><td>8 GB<\/td><td>24 GB<\/td><td>54 GB<\/td><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<p>The 4B variant running at 4-bit quantization fits comfortably on a laptop with 8GB RAM and integrated graphics, though a dedicated GPU is recommended for acceptable speeds.<a rel=\"noreferrer noopener\" target=\"_blank\" href=\"https:\/\/dev.to\/arshtechpro\/gemma-4-a-practical-guide-for-developers-2co5\"><\/a><\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"benchmarks-and-performance\">Benchmarks and Performance<\/h2>\n\n\n\n<p>Early evaluations place Gemma 4 4B at approximately 75-78% of Claude Sonnet&#8217;s performance on coding tasks (HumanEval), and competitive on reading comprehension (RACE, DROP). The larger variants narrow this gap significantly on standard benchmarks.<a rel=\"noreferrer noopener\" target=\"_blank\" href=\"https:\/\/dev.to\/arshtechpro\/gemma-4-a-practical-guide-for-developers-2co5\"><\/a><\/p>\n\n\n\n<p>Where Gemma 4 excels is instruction following. 
Google&#8217;s training methodology emphasizes alignment with human intent, making these models reliable for chatbot applications and assistant-style workflows.<\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"when-to-choose-gemma-4\">When to Choose Gemma 4<\/h2>\n\n\n\n<ul class=\"wp-block-list\">\n<li>You need a balance of capability and efficiency<\/li>\n\n\n\n<li>Long context windows are essential<\/li>\n\n\n\n<li>You are already in the Google ecosystem (Vertex AI, GCP)<\/li>\n\n\n\n<li>You want predictable, low-risk deployment<\/li>\n<\/ul>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"nvidia-nemotron-nano-3-omni-the-efficiency-champio\">NVIDIA Nemotron Nano 3 Omni: The Efficiency Champion<\/h2>\n\n\n\n<p>NVIDIA&#8217;s entry into small models brings something unique: hardware-software co-design. Nemotron Nano 3 Omni was designed specifically to maximize throughput on NVIDIA consumer GPUs while maintaining quality competitive with larger models (April 2026).<\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"technical-specifications\">Technical Specifications<\/h2>\n\n\n\n<figure class=\"wp-block-table\"><table><thead><tr><th class=\"has-text-align-left\" data-align=\"left\">Attribute<\/th><th class=\"has-text-align-left\" data-align=\"left\">Nemotron Nano 3 Omni<\/th><\/tr><\/thead><tbody><tr><td>Parameters<\/td><td>3 billion<\/td><\/tr><tr><td>Context Window<\/td><td>128K<\/td><\/tr><tr><td>Training Tokens<\/td><td>4 trillion<\/td><\/tr><tr><td>Architecture<\/td><td>Transformer with grouped-query attention<\/td><\/tr><tr><td>Specialization<\/td><td>Tool use, function calling, reasoning&nbsp;<a rel=\"noreferrer noopener\" target=\"_blank\" href=\"https:\/\/huggingface.co\/blog\/nvidia\/nemotron-3-nano-omni-multimodal-intelligence\"><\/a><\/td><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<p>The 3 billion parameter count is deliberately aggressive. 
NVIDIA&#8217;s thesis is that architecture and training efficiency matter as much as raw scale.<\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"hardware-requirements\">Hardware Requirements<\/h2>\n\n\n\n<figure class=\"wp-block-table\"><table><thead><tr><th class=\"has-text-align-left\" data-align=\"left\">Precision<\/th><th class=\"has-text-align-left\" data-align=\"left\">VRAM Required<\/th><th class=\"has-text-align-left\" data-align=\"left\">Tokens\/Second (RTX 4090)<\/th><\/tr><\/thead><tbody><tr><td>FP8<\/td><td>6 GB<\/td><td>~120 t\/s<\/td><\/tr><tr><td>INT4<\/td><td>4 GB<\/td><td>~200 t\/s<\/td><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<p>INT4 quantization on an RTX 4090 achieves roughly 200 tokens per second\u2014fast enough for real-time applications.<a rel=\"noreferrer noopener\" target=\"_blank\" href=\"https:\/\/developer.nvidia.com\/blog\/nvidia-nemotron-3-nano-omni-powers-multimodal-agent-reasoning-in-a-single-efficient-open-model\/\"><\/a><\/p>\n\n\n\n<p><strong>What Makes Nemotron Different<\/strong><br>Nemotron Nano 3 Omni was trained with unusual emphasis on tool use and structured outputs. 
The model shows particular strength in:<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>Function calling accuracy (94% on the BFCL benchmark)<\/li>\n\n\n\n<li>JSON generation without syntax errors<\/li>\n\n\n\n<li>Multi-step reasoning with intermediate steps<\/li>\n\n\n\n<li>Following complex, multi-part instructions<\/li>\n<\/ul>\n\n\n\n<p>This makes Nemotron especially suitable for agents, automation workflows, and applications that need structured data extraction.<a rel=\"noreferrer noopener\" target=\"_blank\" href=\"https:\/\/huggingface.co\/blog\/nvidia\/nemotron-3-nano-omni-multimodal-intelligence\"><\/a><\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"when-to-choose-nemotron-nano-3-omni\">When to Choose Nemotron Nano 3 Omni<\/h2>\n\n\n\n<ul class=\"wp-block-list\">\n<li>You have NVIDIA hardware (optimal performance)<\/li>\n\n\n\n<li>Tool use and function calling are critical<\/li>\n\n\n\n<li>You need maximum tokens-per-dollar efficiency<\/li>\n\n\n\n<li>Your application requires structured outputs<\/li>\n<\/ul>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"alibaba-qwen-35b-the-multilingual-surprise\">Alibaba Qwen 3.5B: The Multilingual Surprise<\/h2>\n\n\n\n<p>Alibaba&#8217;s Qwen series has gained significant traction in open-source circles, and Qwen 3.5B punches well above its weight class. 
Where most small models optimize for English, Qwen was trained from the start on multilingual data covering English, Chinese, Japanese, Korean, and major European languages.<a rel=\"noreferrer noopener\" target=\"_blank\" href=\"https:\/\/venturebeat.com\/technology\/alibabas-small-open-source-qwen3-5-9b-beats-openais-gpt-oss-120b-and-can-run\"><\/a><\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"technical-specifications\">Technical Specifications<\/h2>\n\n\n\n<figure class=\"wp-block-table\"><table><thead><tr><th class=\"has-text-align-left\" data-align=\"left\">Attribute<\/th><th class=\"has-text-align-left\" data-align=\"left\">Qwen 3.5B<\/th><\/tr><\/thead><tbody><tr><td>Parameters<\/td><td>3.5 billion<\/td><\/tr><tr><td>Context Window<\/td><td>32K (standard) \/ 128K (extended)<\/td><\/tr><tr><td>Training Tokens<\/td><td>3.6 trillion<\/td><\/tr><tr><td>Languages<\/td><td>29 languages with strong capability<\/td><\/tr><tr><td>License<\/td><td>Apache 2.0 (fully open)&nbsp;<a rel=\"noreferrer noopener\" target=\"_blank\" href=\"https:\/\/apxml.com\/models\/qwen35-35b-a3b\"><\/a><\/td><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<p>The Apache 2.0 license matters. Qwen can be used commercially without limitation, modified, and redistributed.<\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"hardware-requirements\">Hardware Requirements<\/h2>\n\n\n\n<figure class=\"wp-block-table\"><table><thead><tr><th class=\"has-text-align-left\" data-align=\"left\">Precision<\/th><th class=\"has-text-align-left\" data-align=\"left\">VRAM Needed<\/th><th class=\"has-text-align-left\" data-align=\"left\">Notes<\/th><\/tr><\/thead><tbody><tr><td>Q4_0<\/td><td>2.5 GB<\/td><td>Minimum viable<\/td><\/tr><tr><td>Q8_0<\/td><td>4.5 GB<\/td><td>Recommended<\/td><\/tr><tr><td>FP16<\/td><td>7 GB<\/td><td>Best quality<\/td><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<p>Qwen 3.5B is remarkably efficient. 
The 4-bit quantized version runs on an 8GB consumer GPU with room to spare.<a rel=\"noreferrer noopener\" target=\"_blank\" href=\"https:\/\/apxml.com\/models\/qwen35-35b-a3b\"><\/a><\/p>\n\n\n\n<p><strong>Multilingual Capability<\/strong><br>This is Qwen&#8217;s distinguishing feature. On the multilingual MMLU benchmark:<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>Qwen 3.5B: 72.4% (average across languages)<\/li>\n\n\n\n<li>Gemma 4 4B: 61.2%<\/li>\n\n\n\n<li>Nemotron 3B: 58.7%<\/li>\n\n\n\n<li>Mistral Small: ~65%<a href=\"https:\/\/venturebeat.com\/technology\/alibabas-small-open-source-qwen3-5-9b-beats-openais-gpt-oss-120b-and-can-run\" target=\"_blank\" rel=\"noreferrer noopener\"><\/a><\/li>\n<\/ul>\n\n\n\n<p>For applications targeting global markets, or Chinese-language use cases specifically, Qwen has a clear advantage.<\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"when-to-choose-qwen-35b\">When to Choose Qwen 3.5B<\/h2>\n\n\n\n<ul class=\"wp-block-list\">\n<li>You need strong non-English capability<\/li>\n\n\n\n<li>Open licensing matters for your use case<\/li>\n\n\n\n<li>You are working with mixed-language content<\/li>\n\n\n\n<li>You want the smallest viable footprint<\/li>\n<\/ul>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"mistral-small-the-reasoning-specialist\">Mistral Small: The Reasoning Specialist<\/h2>\n\n\n\n<p>French AI company Mistral has built a reputation for efficient, capable models. 
Mistral Small (~24B effective parameters, sliding window attention) brings this philosophy to the edge deployment space.<\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"technical-specifications\">Technical Specifications<\/h2>\n\n\n\n<figure class=\"wp-block-table\"><table><thead><tr><th class=\"has-text-align-left\" data-align=\"left\">Attribute<\/th><th class=\"has-text-align-left\" data-align=\"left\">Mistral Small<\/th><\/tr><\/thead><tbody><tr><td>Parameters<\/td><td>~24B effective<\/td><\/tr><tr><td>Context Window<\/td><td>32K<\/td><\/tr><tr><td>Architecture<\/td><td>Sliding window attention<\/td><\/tr><tr><td>Specialization<\/td><td>Reasoning, coding, instruction following&nbsp;<a rel=\"noreferrer noopener\" target=\"_blank\" href=\"https:\/\/n8n.io\/ai-benchmark\/mistral-small-24b-instruct-2501\/\"><\/a><\/td><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<p>Mistral uses grouped-query attention and sliding window attention to maintain performance with smaller parameter counts. These techniques reduce memory access patterns, improving speed on consumer hardware.<\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"hardware-requirements\">Hardware Requirements<\/h2>\n\n\n\n<figure class=\"wp-block-table\"><table><thead><tr><th class=\"has-text-align-left\" data-align=\"left\">Precision<\/th><th class=\"has-text-align-left\" data-align=\"left\">VRAM Needed<\/th><th class=\"has-text-align-left\" data-align=\"left\">Speed (Mac M3)<\/th><\/tr><\/thead><tbody><tr><td>Q4_K_M<\/td><td>2.8 GB<\/td><td>~45 t\/s<\/td><\/tr><tr><td>Q8_0<\/td><td>5.2 GB<\/td><td>~28 t\/s<\/td><\/tr><tr><td>FP16<\/td><td>7.5 GB<\/td><td>~12 t\/s<\/td><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<p>Mistral Small achieves competitive speeds even on Apple Silicon, where many models struggle with memory bandwidth limitations.<a rel=\"noreferrer noopener\" target=\"_blank\" href=\"https:\/\/n8n.io\/ai-benchmark\/mistral-small-24b-instruct-2501\/\"><\/a><\/p>\n\n\n\n<p><strong>Reasoning and Coding 
Strength<\/strong><br>On coding benchmarks (HumanEval, MBPP), Mistral Small outperforms comparably-sized models:<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>HumanEval pass@1: 71.2%<\/li>\n\n\n\n<li>Qwen 3.5B: 68.4%<\/li>\n\n\n\n<li>Gemma 4 4B: 66.8%<\/li>\n\n\n\n<li>Nemotron 3B: 64.1%<a href=\"https:\/\/n8n.io\/ai-benchmark\/mistral-small-24b-instruct-2501\/\" target=\"_blank\" rel=\"noreferrer noopener\"><\/a><\/li>\n<\/ul>\n\n\n\n<p>The model also shows strong performance on mathematical reasoning (GSM8K: 72.1%), suggesting robust chain-of-thought capability despite its size.<\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"when-to-choose-mistral-small\">When to Choose Mistral Small<\/h2>\n\n\n\n<ul class=\"wp-block-list\">\n<li>Coding assistance is a primary use case<\/li>\n\n\n\n<li>You value reasoning ability over raw knowledge<\/li>\n\n\n\n<li>You are deploying on Apple Silicon (M-series chips)<\/li>\n\n\n\n<li>You want European-developed alternatives<\/li>\n<\/ul>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"head-to-head-comparison\">Head-to-Head Comparison<\/h2>\n\n\n\n<figure class=\"wp-block-table\"><table><thead><tr><th class=\"has-text-align-left\" data-align=\"left\">Metric<\/th><th class=\"has-text-align-left\" data-align=\"left\">Gemma 4 4B<\/th><th class=\"has-text-align-left\" data-align=\"left\">Nemotron 3B<\/th><th class=\"has-text-align-left\" data-align=\"left\">Qwen 3.5B<\/th><th class=\"has-text-align-left\" data-align=\"left\">Mistral Small<\/th><\/tr><\/thead><tbody><tr><td>Parameters<\/td><td>4B<\/td><td>3B<\/td><td>3.5B<\/td><td>~24B<\/td><\/tr><tr><td>Context Window<\/td><td>128K<\/td><td>128K<\/td><td>32K\/128K<\/td><td>32K<\/td><\/tr><tr><td>Min GPU (4-bit)<\/td><td>3-4 GB<\/td><td>4 GB<\/td><td>2.5 GB<\/td><td>2.8 GB<\/td><\/tr><tr><td>English 
Quality<\/td><td>High<\/td><td>High<\/td><td>High<\/td><td>High<\/td><\/tr><tr><td>Multilingual<\/td><td>Basic<\/td><td>Basic<\/td><td>Excellent<\/td><td>Good<\/td><\/tr><tr><td>Coding<\/td><td>Good<\/td><td>Good<\/td><td>Good<\/td><td>Excellent<\/td><\/tr><tr><td>Tool Use<\/td><td>Good<\/td><td>Excellent<\/td><td>Basic<\/td><td>Good<\/td><\/tr><tr><td>License<\/td><td><strong>Apache 2.0<\/strong><\/td><td>Commercial<\/td><td>Apache 2.0<\/td><td>Apache 2.0<\/td><\/tr><tr><td>Tokens\/Second (RTX 4090, 4-bit)<\/td><td>~85<\/td><td>~200<\/td><td>~95<\/td><td>~110&nbsp;<a rel=\"noreferrer noopener\" target=\"_blank\" href=\"https:\/\/www.latent.space\/p\/ainews-top-local-models-list-april\"><\/a><\/td><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"choosing-the-right-model-for-your-application\">Choosing the Right Model for Your Application<\/h2>\n\n\n\n<p>Selecting a small local model requires matching capabilities to requirements. Here is a decision framework:<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>For document processing and RAG<\/strong>: Gemma 4&#8217;s 128K context window is decisive. Nothing else in this comparison matches its ability to ingest entire documents without chunking.<\/li>\n\n\n\n<li><strong>For agent and automation workflows<\/strong>: Nemotron Nano 3 Omni&#8217;s tool-use proficiency and throughput efficiency make it the logical choice. 
The INT4 performance on RTX hardware is unmatched.<\/li>\n\n\n\n<li><strong>For global applications<\/strong>: Qwen 3.5B&#8217;s multilingual capability and open licensing provide clear advantages, particularly for Asian language support.<\/li>\n\n\n\n<li><strong>For coding and development<\/strong>: Mistral Small&#8217;s reasoning benchmarks and Apple Silicon optimization make it ideal for IDE integrations and developer tools.<a href=\"https:\/\/www.latent.space\/p\/ainews-top-local-models-list-april\" target=\"_blank\" rel=\"noreferrer noopener\"><\/a><\/li>\n<\/ul>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"deployment-patterns-that-work\">Deployment Patterns That Work<\/h2>\n\n\n\n<p>Small local models unlock deployment patterns that are impractical with cloud APIs.<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>Edge deployment on consumer hardware<\/strong>: A $1,500 laptop with an RTX 4060 can run any of these models with acceptable speed. This enables offline-first applications, field work without connectivity, and privacy-sensitive workflows.<\/li>\n\n\n\n<li><strong>Embedded and IoT<\/strong>: With quantization and optimization, Nemotron Nano 3 Omni has been demonstrated running on the NVIDIA Jetson series, enabling AI on devices with 8GB shared memory.<\/li>\n\n\n\n<li><strong>Private knowledge bases<\/strong>: Companies are deploying these models on internal servers to answer questions from proprietary documents without ever exposing data to third parties.<\/li>\n\n\n\n<li><strong>Hybrid architectures<\/strong>: Many production systems now use small local models for first-pass filtering and routing, only escalating to cloud APIs when confidence is low. 
This reduces API costs by 70-90% while maintaining quality.<\/li>\n<\/ul>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"practical-considerations\">Practical Considerations<\/h2>\n\n\n\n<p>Running small local models is not without friction.<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>Quantization trade-offs<\/strong>: 4-bit models save memory and run faster but can show degradation on precision-sensitive tasks like mathematical reasoning or structured data extraction. Test carefully against your actual use case, not just benchmarks.<\/li>\n\n\n\n<li><strong>Context length limitations<\/strong>: Even 128K context windows fill quickly with long conversations or large documents. Implement proper context management\u2014summarization, sliding windows, or retrieval-augmented generation\u2014to avoid silent truncation.<\/li>\n\n\n\n<li><strong>Hardware heterogeneity<\/strong>: A model that runs well on an RTX 4090 may struggle on an M3 MacBook or older GPU. Budget time for testing across your target deployment hardware.<\/li>\n\n\n\n<li><strong>Monitoring and observability<\/strong>: Unlike managed APIs, local models require you to implement logging, rate limiting, and error handling. The operational burden shifts from vendor management to infrastructure management.<\/li>\n<\/ul>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"what-about-tomorrow\">What About Tomorrow?<\/h2>\n\n\n\n<p>The small model space moves fast. Gemma 4 was announced in April 2026; Nemotron Nano 3 Omni followed weeks later. By the time you read this, newer variants may exist.<\/p>\n\n\n\n<p>The trend is clear: models are getting smaller and more capable simultaneously. Techniques like mixture-of-experts (already present in larger models), speculative decoding, and better training data curation will continue compressing capability into fewer parameters.<\/p>\n\n\n\n<p>For developers, this is excellent news. The barrier to private, offline, low-cost AI keeps falling. 
The models in this guide already handle 80% of common AI tasks without cloud dependency. That percentage will only grow.<\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"why-this-shift-is-permanent\">Why This Shift Is Permanent<\/h2>\n\n\n\n<p>Large frontier models will not disappear. They remain essential for tasks requiring deep reasoning, broad knowledge, or creative synthesis. But they are increasingly overkill for routine tasks.<\/p>\n\n\n\n<p>The economics favor specialization. Privacy regulations will accelerate adoption. As jurisdictions tighten data residency requirements, local inference becomes not just an economic but a legal necessity.<\/p>\n\n\n\n<p>Finally, there is resilience. The ability to operate independently of cloud providers\u2014during outages, in remote locations, or simply to control one&#8217;s own infrastructure\u2014matters more to organizations than benchmark scores.<\/p>\n\n\n\n<p>Small local models are not a compromise. They are a different category of tool, optimized for different constraints. 
Understanding when to deploy them is as important as understanding how.<\/p>\n\n\n\n<p><\/p>\n\n\n\n<p><\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong><a href=\"https:\/\/jorgep.com\/blog\/category\/ai\/\" target=\"_blank\" rel=\"noreferrer noopener\">Jorge&#8217;s Local AI Series<\/a><\/strong>  |  <strong><a href=\"https:\/\/jorgep.com\/blog\/book-series-ai-dont-just-chat\/\" target=\"_blank\" rel=\"noreferrer noopener\">AI Learnings Series<\/a><\/strong> <\/li>\n\n\n\n<li><strong><a href=\"https:\/\/ai.google.dev\/gemma\" target=\"_blank\" rel=\"noreferrer noopener\">Google Gemma 4 Technical Report<\/a><\/strong><\/li>\n\n\n\n<li><strong><a href=\"https:\/\/developer.nvidia.com\/nemotron\" target=\"_blank\" rel=\"noreferrer noopener\">NVIDIA Nemotron Nano 3 Omni Documentation<\/a><\/strong><\/li>\n\n\n\n<li><strong><a href=\"https:\/\/huggingface.co\/Qwen\/Qwen2.5-3B-Instruct\" target=\"_blank\" rel=\"noreferrer noopener\">Qwen 3.5B Hugging Face Repository<\/a><\/strong><\/li>\n\n\n\n<li><strong><a href=\"https:\/\/docs.mistral.ai\/models\/\" target=\"_blank\" rel=\"noreferrer noopener\">Mistral Small Model Card<\/a><\/strong><\/li>\n\n\n\n<li><strong><a href=\"https:\/\/github.com\/ggerganov\/llama.cpp\" target=\"_blank\" rel=\"noreferrer noopener\">llama.cpp (for local inference)<\/a><\/strong><\/li>\n\n\n\n<li><strong><a href=\"https:\/\/ollama.com\/\" target=\"_blank\" rel=\"noreferrer noopener\">Ollama (simplified local deployment)<\/a><\/strong><\/li>\n\n\n\n<li><strong><a href=\"https:\/\/github.com\/vllm-project\/vllm\" target=\"_blank\" rel=\"noreferrer noopener\">vLLM (throughput-optimized serving)<\/a><\/strong><\/li>\n\n\n\n<li><strong><a href=\"https:\/\/lmstudio.ai\/\" target=\"_blank\" rel=\"noreferrer noopener\">LM Studio (GUI for local models)<\/a><\/strong><\/li>\n<\/ul>\n\n\n\n<p><\/p>\n","protected":false},"excerpt":{"rendered":"<p>The narrative around artificial intelligence has long been dominated by bigger equals better. 
Models with trillions of parameters, trained on internet-scale datasets, powered by massive GPU clusters\u2014these were the benchmarks of progress. But something interesting is happening at the other end of the spectrum. Small models\u2014measured in billions, not trillions of parameters\u2014are proving they can&#8230;<\/p>\n","protected":false},"author":2,"featured_media":427864,"comment_status":"closed","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"_kad_blocks_custom_css":"","_kad_blocks_head_custom_js":"","_kad_blocks_body_custom_js":"","_kad_blocks_footer_custom_js":"","ngg_post_thumbnail":0,"_kad_post_transparent":"","_kad_post_title":"","_kad_post_layout":"","_kad_post_sidebar_id":"","_kad_post_content_style":"","_kad_post_vertical_padding":"","_kad_post_feature":"","_kad_post_feature_position":"","_kad_post_header":false,"_kad_post_footer":false,"_kad_post_classname":"","footnotes":""},"categories":[1031,441],"tags":[471,930,876,986],"class_list":["post-520609","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-ai-learnings-series","category-tech-talk","tag-ai","tag-ai-series","tag-llm","tag-local-ai"],"taxonomy_info":{"category":[{"value":1031,"label":"AI Learnings Series"},{"value":441,"label":"Tech Talk"}],"post_tag":[{"value":471,"label":"AI"},{"value":930,"label":"AI Series"},{"value":876,"label":"LLM"},{"value":986,"label":"Local AI"}]},"featured_image_src_large":["https:\/\/jorgep.com\/blog\/wp-content\/uploads\/FeaturedImage-Topic-AI-1024x512.png",1024,512,true],"author_info":{"display_name":"Jorge Pereira","author_link":"https:\/\/jorgep.com\/blog\/author\/jorge\/"},"comment_info":0,"category_info":[{"term_id":1031,"name":"AI Learnings Series","slug":"ai-learnings-series","term_group":0,"term_taxonomy_id":1041,"taxonomy":"category","description":"","parent":0,"count":19,"filter":"raw","cat_ID":1031,"category_count":19,"category_description":"","cat_name":"AI Learnings 
Series","category_nicename":"ai-learnings-series","category_parent":0},{"term_id":441,"name":"Tech Talk","slug":"tech-talk","term_group":0,"term_taxonomy_id":451,"taxonomy":"category","description":"","parent":0,"count":700,"filter":"raw","cat_ID":441,"category_count":700,"category_description":"","cat_name":"Tech Talk","category_nicename":"tech-talk","category_parent":0}],"tag_info":[{"term_id":471,"name":"AI","slug":"ai","term_group":0,"term_taxonomy_id":481,"taxonomy":"post_tag","description":"","parent":0,"count":162,"filter":"raw"},{"term_id":930,"name":"AI Series","slug":"ai-series","term_group":0,"term_taxonomy_id":940,"taxonomy":"post_tag","description":"","parent":0,"count":168,"filter":"raw"},{"term_id":876,"name":"LLM","slug":"llm","term_group":0,"term_taxonomy_id":886,"taxonomy":"post_tag","description":"","parent":0,"count":19,"filter":"raw"},{"term_id":986,"name":"Local AI","slug":"local-ai","term_group":0,"term_taxonomy_id":996,"taxonomy":"post_tag","description":"","parent":0,"count":37,"filter":"raw"}],"_links":{"self":[{"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/posts\/520609","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/users\/2"}],"replies":[{"embeddable":true,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/comments?post=520609"}],"version-history":[{"count":6,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/posts\/520609\/revisions"}],"predecessor-version":[{"id":520646,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/posts\/520609\/revisions\/520646"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/media\/427864"}],"wp:attachment":[{"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/media?parent=520609"}],"wp:term":[{"taxonomy":"category","embeddable":tr
ue,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/categories?post=520609"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/tags?post=520609"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}