 {"id":521354,"date":"2025-09-20T09:30:00","date_gmt":"2025-09-20T16:30:00","guid":{"rendered":"https:\/\/jorgep.com\/blog\/?p=521354"},"modified":"2026-06-30T12:12:02","modified_gmt":"2026-06-30T19:12:02","slug":"the-small-model-paradox-solving-the-local-ai-context-crunch","status":"publish","type":"post","link":"https:\/\/jorgep.com\/blog\/the-small-model-paradox-solving-the-local-ai-context-crunch\/","title":{"rendered":"The Small Model Paradox: Solving the Local AI Context Crunch"},"content":{"rendered":"\n<div class=\"wp-block-columns has-theme-palette-7-background-color has-background is-layout-flex wp-container-core-columns-is-layout-5dc627e1 wp-block-columns-is-layout-flex\" style=\"margin-top:0;margin-bottom:0\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:80%\">\n<p class=\"wp-block-paragraph\">Part of: <strong> <a href=\"https:\/\/jorgep.com\/blog\/series-ai-learnings\/\">AI Learning Series Here<\/a><\/strong><\/p>\n\n\n<style>.kadence-column395113_e6e0a6-a4 > .kt-inside-inner-col,.kadence-column395113_e6e0a6-a4 > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column395113_e6e0a6-a4 > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column395113_e6e0a6-a4 > .kt-inside-inner-col{flex-direction:column;}.kadence-column395113_e6e0a6-a4 > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column395113_e6e0a6-a4 > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column395113_e6e0a6-a4{position:relative;}@media all and (max-width: 1024px){.kadence-column395113_e6e0a6-a4 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column395113_e6e0a6-a4 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column395113_e6e0a6-a4\"><div 
class=\"kt-inside-inner-col\"><style>.kadence-column510545_f73041-db > .kt-inside-inner-col{padding-top:var(--global-kb-spacing-xs, 1rem);padding-bottom:var(--global-kb-spacing-xs, 1rem);}.kadence-column510545_f73041-db > .kt-inside-inner-col,.kadence-column510545_f73041-db > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column510545_f73041-db > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column510545_f73041-db > .kt-inside-inner-col{flex-direction:column;}.kadence-column510545_f73041-db > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column510545_f73041-db > .kt-inside-inner-col{background-color:var(--global-palette7, #EDF2F7);}.kadence-column510545_f73041-db:hover > .kt-inside-inner-col{background-color:var(--global-palette8, #F7FAFC);background-image:none;}.kadence-column510545_f73041-db > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column510545_f73041-db{position:relative;}@media all and (max-width: 1024px){.kadence-column510545_f73041-db > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column510545_f73041-db > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column510545_f73041-db\"><div class=\"kt-inside-inner-col\"><style>.wp-block-kadence-advancedheading.kt-adv-heading510545_c22cc7-a4, .wp-block-kadence-advancedheading.kt-adv-heading510545_c22cc7-a4[data-kb-block=\"kb-adv-heading510545_c22cc7-a4\"]{text-align:center;font-size:var(--global-kb-font-size-sm, 0.9rem);font-style:normal;}.wp-block-kadence-advancedheading.kt-adv-heading510545_c22cc7-a4 mark.kt-highlight, .wp-block-kadence-advancedheading.kt-adv-heading510545_c22cc7-a4[data-kb-block=\"kb-adv-heading510545_c22cc7-a4\"] 
mark.kt-highlight{font-style:normal;color:#f76a0c;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding-top:0px;padding-right:0px;padding-bottom:0px;padding-left:0px;}.wp-block-kadence-advancedheading.kt-adv-heading510545_c22cc7-a4 img.kb-inline-image, .wp-block-kadence-advancedheading.kt-adv-heading510545_c22cc7-a4[data-kb-block=\"kb-adv-heading510545_c22cc7-a4\"] img.kb-inline-image{width:150px;vertical-align:baseline;}<\/style>\n<p class=\"kt-adv-heading510545_c22cc7-a4 wp-block-kadence-advancedheading\" data-kb-block=\"kb-adv-heading510545_c22cc7-a4\">Quick Links:&nbsp;<a href=\"https:\/\/jorgep.com\/blog\/resources-for-learning-ai\/\">Resources for Learning AI<\/a> | <a href=\"https:\/\/jorgep.com\/blog\/keeping-up-with-ai\/\">Keep up with AI<\/a> | <a href=\"https:\/\/jorgep.com\/blog\/list-of-ai-tools\/\" data-type=\"post\" data-id=\"402818\">List of AI Tools<\/a> | <a href=\"https:\/\/jorgep.com\/blog\/local-ai-series\/\" data-type=\"page\" data-id=\"519365\">Local AI<\/a> | <a href=\"https:\/\/jorgep.com\/blog\/tag\/ai-agents\/\" data-type=\"post_tag\" data-id=\"941\">AI Agents<\/a> |  <a href=\"https:\/\/jorgep.com\/blog\/work-beyond-tomorrow-series\/\" data-type=\"page\" data-id=\"365001\">Future of Work<\/a><\/p>\n<\/div><\/div>\n<\/div><\/div>\n\n\n<style>.kb-row-layout-id395113_97845d-28 > .kt-row-column-wrap{align-content:start;}:where(.kb-row-layout-id395113_97845d-28 > .kt-row-column-wrap) > .wp-block-kadence-column{justify-content:start;}.kb-row-layout-id395113_97845d-28 > .kt-row-column-wrap{column-gap:var(--global-kb-gap-md, 2rem);row-gap:var(--global-kb-gap-none, 0rem );padding-top:var(--global-kb-spacing-xxs, 0.5rem);padding-bottom:var(--global-kb-spacing-xxs, 0.5rem);grid-template-columns:repeat(2, minmax(0, 1fr));}.kb-row-layout-id395113_97845d-28 > .kt-row-layout-overlay{opacity:0.30;}@media all and (max-width: 1024px){.kb-row-layout-id395113_97845d-28 > .kt-row-column-wrap{grid-template-columns:repeat(2, minmax(0, 
1fr));}}@media all and (max-width: 767px){.kb-row-layout-id395113_97845d-28 > .kt-row-column-wrap{grid-template-columns:minmax(0, 1fr);}}<\/style><div class=\"kb-row-layout-wrap kb-row-layout-id395113_97845d-28 alignnone wp-block-kadence-rowlayout\"><div class=\"kt-row-column-wrap kt-has-2-columns kt-row-layout-equal kt-tab-layout-inherit kt-mobile-layout-row kt-row-valign-top\">\n<style>.kadence-column395113_fb3852-97 > .kt-inside-inner-col,.kadence-column395113_fb3852-97 > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column395113_fb3852-97 > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column395113_fb3852-97 > .kt-inside-inner-col{flex-direction:column;}.kadence-column395113_fb3852-97 > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column395113_fb3852-97 > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column395113_fb3852-97{position:relative;}@media all and (max-width: 1024px){.kadence-column395113_fb3852-97 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column395113_fb3852-97 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column395113_fb3852-97\"><div class=\"kt-inside-inner-col\"><style>.wp-block-kadence-advancedheading.kt-adv-heading395113_0b34c1-ff, .wp-block-kadence-advancedheading.kt-adv-heading395113_0b34c1-ff[data-kb-block=\"kb-adv-heading395113_0b34c1-ff\"]{text-align:center;font-size:var(--global-kb-font-size-sm, 0.9rem);line-height:60px;font-style:normal;background-color:#f5a511;}.wp-block-kadence-advancedheading.kt-adv-heading395113_0b34c1-ff mark.kt-highlight, .wp-block-kadence-advancedheading.kt-adv-heading395113_0b34c1-ff[data-kb-block=\"kb-adv-heading395113_0b34c1-ff\"] 
mark.kt-highlight{font-style:normal;color:#f76a0c;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding-top:0px;padding-right:0px;padding-bottom:0px;padding-left:0px;}.wp-block-kadence-advancedheading.kt-adv-heading395113_0b34c1-ff img.kb-inline-image, .wp-block-kadence-advancedheading.kt-adv-heading395113_0b34c1-ff[data-kb-block=\"kb-adv-heading395113_0b34c1-ff\"] img.kb-inline-image{width:150px;vertical-align:baseline;}<\/style>\n<p class=\"kt-adv-heading395113_0b34c1-ff wp-block-kadence-advancedheading\" data-kb-block=\"kb-adv-heading395113_0b34c1-ff\">Subscribe to <a href=\"https:\/\/go.35s.be\/jtb\" target=\"_blank\" rel=\"noreferrer noopener\"><strong>JorgeTechBits  newsletter<\/strong><\/a><\/p>\n<\/div><\/div>\n\n\n<style>.kadence-column395113_1641f9-51 > .kt-inside-inner-col,.kadence-column395113_1641f9-51 > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column395113_1641f9-51 > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column395113_1641f9-51 > .kt-inside-inner-col{flex-direction:column;}.kadence-column395113_1641f9-51 > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column395113_1641f9-51 > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column395113_1641f9-51{position:relative;}@media all and (max-width: 1024px){.kadence-column395113_1641f9-51 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column395113_1641f9-51 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column395113_1641f9-51\"><div class=\"kt-inside-inner-col\"><style>.wp-block-kadence-advancedheading.kt-adv-heading395113_63dab8-35, 
.wp-block-kadence-advancedheading.kt-adv-heading395113_63dab8-35[data-kb-block=\"kb-adv-heading395113_63dab8-35\"]{text-align:center;font-size:var(--global-kb-font-size-sm, 0.9rem);font-style:normal;}.wp-block-kadence-advancedheading.kt-adv-heading395113_63dab8-35 mark.kt-highlight, .wp-block-kadence-advancedheading.kt-adv-heading395113_63dab8-35[data-kb-block=\"kb-adv-heading395113_63dab8-35\"] mark.kt-highlight{font-style:normal;color:#f76a0c;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding-top:0px;padding-right:0px;padding-bottom:0px;padding-left:0px;}.wp-block-kadence-advancedheading.kt-adv-heading395113_63dab8-35 img.kb-inline-image, .wp-block-kadence-advancedheading.kt-adv-heading395113_63dab8-35[data-kb-block=\"kb-adv-heading395113_63dab8-35\"] img.kb-inline-image{width:150px;vertical-align:baseline;}<\/style>\n<p class=\"kt-adv-heading395113_63dab8-35 wp-block-kadence-advancedheading\" data-kb-block=\"kb-adv-heading395113_63dab8-35\">Explore the <a href=\"https:\/\/jorgep.com\/blog\/latest-token-prices\/\" data-type=\"page\" data-id=\"521255\">Latest Token Prices<\/a><\/p>\n<\/div><\/div>\n\n<\/div><\/div><\/div>\n\n\n\n<div class=\"wp-block-column is-vertically-aligned-top is-layout-flow wp-block-column-is-layout-flow\"><div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><a href=\"https:\/\/jorgep.com\/blog\/book-dont-just-chat-delegate\/\"><img loading=\"lazy\" decoding=\"async\" width=\"640\" height=\"1024\" src=\"https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-640x1024.jpg\" alt=\"\" class=\"wp-image-520234\" style=\"aspect-ratio:0.6250142320391666;width:98px;height:auto\" srcset=\"https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-640x1024.jpg 640w, https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-188x300.jpg 188w, https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-768x1229.jpg 768w, 
https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-960x1536.jpg 960w, https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-1280x2048.jpg 1280w, https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01.jpg 1600w\" sizes=\"auto, (max-width: 640px) 100vw, 640px\" \/><\/a><figcaption class=\"wp-element-caption\"><a href=\"https:\/\/jorgep.com\/blog\/book-series-ai-dont-just-chat\/\" data-type=\"page\" data-id=\"520242\">Check out the Book Series<\/a><\/figcaption><\/figure>\n<\/div><\/div>\n<\/div>\n\n\n<style>.wp-block-kadence-advancedheading.kt-adv-heading519190_b33a00-c9, .wp-block-kadence-advancedheading.kt-adv-heading519190_b33a00-c9[data-kb-block=\"kb-adv-heading519190_b33a00-c9\"]{font-size:var(--global-kb-font-size-sm, 0.9rem);font-style:normal;}.wp-block-kadence-advancedheading.kt-adv-heading519190_b33a00-c9 mark.kt-highlight, .wp-block-kadence-advancedheading.kt-adv-heading519190_b33a00-c9[data-kb-block=\"kb-adv-heading519190_b33a00-c9\"] mark.kt-highlight{font-style:normal;color:#f76a0c;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding-top:0px;padding-right:0px;padding-bottom:0px;padding-left:0px;}.wp-block-kadence-advancedheading.kt-adv-heading519190_b33a00-c9 img.kb-inline-image, .wp-block-kadence-advancedheading.kt-adv-heading519190_b33a00-c9[data-kb-block=\"kb-adv-heading519190_b33a00-c9\"] img.kb-inline-image{width:150px;vertical-align:baseline;}<\/style>\n<p class=\"kt-adv-heading519190_b33a00-c9 wp-block-kadence-advancedheading\" data-kb-block=\"kb-adv-heading519190_b33a00-c9\"><strong>Disclaimer:<\/strong> <strong>I create this content entirely on my own time, and the views expressed here are mine alone (not my employer&#8217;s)<\/strong>. 
Because I love leveraging new tech, I use AI tools like Gemini, NotebookLM, Claude, Perplexity and others as a &#8220;digital team&#8221; to help research and polish these articles so I can share the best possible insights with you!<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">I love to experiment, tear things apart, and figure out exactly how things work under the hood. For me, the best way to truly understand a technology is by doing\u2014which is why I prefer developing my own custom tools rather than just relying on pre-built software. While some polished front-end interfaces already have built-in memory management, building my own custom app from scratch has revealed the fascinating, messy reality of local AI limitations.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">Please see <a href=\"https:\/\/jorgep.com\/blog\/small-local-models-why-tiny-ai-is-having-a-big-moment\/\" data-type=\"post\" data-id=\"520609\">Small Local Models: Why Tiny AI Is Having a Big Moment<\/a> and the other posts in the <a href=\"https:\/\/jorgep.com\/blog\/local-ai-series\/\" data-type=\"page\" data-id=\"519365\">Local AI Series<\/a>.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">Running AI entirely on your local machine is the ultimate setup for privacy, speed, and deep customization. But if you use local AI for long-form brainstorming, coding, or complex ideation sessions, you will quickly hit a frustrating wall: <strong>the context window crash.<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">Smaller, highly optimized local models are fast and incredibly efficient. 
However, because their context windows are tight, they tend to &#8220;forget&#8221; the beginning of your conversation just as your ideas are starting to get good.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">Here is why this context crunch happens with local AI, and how you can programmatically solve it in your own custom applications.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">The Challenge: The Overhead of Ideation<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">When you brainstorm, conversations are messy. They are filled with pleasantries, discarded ideas, tangents, and repetitive phrasing.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">For a massive cloud-based AI, this conversational filler is a drop in the bucket. But for a lightweight, local model, every single token matters. As your chat history grows, several critical bottlenecks occur:<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n\n<ol class=\"wp-block-list\">\n<li><strong>The Model Slows Down:<\/strong> Processing a bloated chat history requires more compute power, causing generation speeds (tokens per second) to drop significantly.<\/li>\n\n\n\n<li><strong>Hallucinations Rise:<\/strong> Smaller models struggle to parse through massive, messy data streams, leading to confusion, missed instructions, and factual errors.<\/li>\n\n\n\n<li><strong>The Repeat-Loop Trap:<\/strong> When smaller models get overwhelmed by a massive context, they often fall into a &#8220;repetition loop.&#8221; They get stuck repeating the same phrase, bullet point, or piece of code over and over again, completely wasting tokens and breaking the interaction.<\/li>\n\n\n\n<li><strong>The Hard Wall:<\/strong> The model completely runs out of space, resulting in a crashed session, broken JSON payloads, or total memory loss.<\/li>\n<\/ol>\n\n\n\n<p class=\"wp-block-paragraph\">To build a seamless local AI experience, your application needs to handle memory management automatically in the background.<\/p>\n\n\n\n<p 
class=\"wp-block-paragraph\">The Solution: A Technical Blueprint for Context Compaction<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">You do not need a bigger model to solve this problem; you just need smarter engineering. By implementing a <strong>Continuous Compaction<\/strong> architecture in your custom application, you can give your local AI an optimized, near-infinite memory.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n<style>.wp-block-kadence-advancedheading.kt-adv-heading521354_e09b27-02, .wp-block-kadence-advancedheading.kt-adv-heading521354_e09b27-02[data-kb-block=\"kb-adv-heading521354_e09b27-02\"]{font-style:normal;}.wp-block-kadence-advancedheading.kt-adv-heading521354_e09b27-02 mark.kt-highlight, .wp-block-kadence-advancedheading.kt-adv-heading521354_e09b27-02[data-kb-block=\"kb-adv-heading521354_e09b27-02\"] mark.kt-highlight{font-style:normal;color:#f76a0c;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding-top:0px;padding-right:0px;padding-bottom:0px;padding-left:0px;}.wp-block-kadence-advancedheading.kt-adv-heading521354_e09b27-02 img.kb-inline-image, .wp-block-kadence-advancedheading.kt-adv-heading521354_e09b27-02[data-kb-block=\"kb-adv-heading521354_e09b27-02\"] img.kb-inline-image{width:150px;vertical-align:baseline;}<\/style>\n<p class=\"kt-adv-heading521354_e09b27-02 wp-block-kadence-advancedheading\" data-kb-block=\"kb-adv-heading521354_e09b27-02\"><strong>1. Implement a Dual-Threshold Watermark<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">Do not wait for the context window to fill up completely. 
Set up an automated trigger system in your app&#8217;s chat-management logic:<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>The Soft Limit (70% capacity):<\/strong> When the conversation fills 70% of the maximum token limit, fire an asynchronous background function to compress the oldest parts of the history.<\/li>\n\n\n\n<li><strong>The Hard Limit (95% capacity):<\/strong> If you are typing rapidly and the compression hasn&#8217;t finished, forcefully truncate the oldest messages as a fallback safety measure to prevent application crashes and halt repetition loops.<\/li>\n<\/ul>\n\n\n<style>.wp-block-kadence-advancedheading.kt-adv-heading521354_c6ce87-31, .wp-block-kadence-advancedheading.kt-adv-heading521354_c6ce87-31[data-kb-block=\"kb-adv-heading521354_c6ce87-31\"]{font-style:normal;}.wp-block-kadence-advancedheading.kt-adv-heading521354_c6ce87-31 mark.kt-highlight, .wp-block-kadence-advancedheading.kt-adv-heading521354_c6ce87-31[data-kb-block=\"kb-adv-heading521354_c6ce87-31\"] mark.kt-highlight{font-style:normal;color:#f76a0c;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding-top:0px;padding-right:0px;padding-bottom:0px;padding-left:0px;}.wp-block-kadence-advancedheading.kt-adv-heading521354_c6ce87-31 img.kb-inline-image, .wp-block-kadence-advancedheading.kt-adv-heading521354_c6ce87-31[data-kb-block=\"kb-adv-heading521354_c6ce87-31\"] img.kb-inline-image{width:150px;vertical-align:baseline;}<\/style>\n<p class=\"kt-adv-heading521354_c6ce87-31 wp-block-kadence-advancedheading\" data-kb-block=\"kb-adv-heading521354_c6ce87-31\"><strong>2. Automated &#8220;Checkpoint&#8221; Summarization<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">When your soft limit hits, do not just delete old messages. 
Isolate the oldest chunk of the conversation (leaving the most recent 3 to 4 turns completely intact so the AI remembers the immediate conversational flow).<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">Pass those older messages through a background prompt designed to strip out the fluff and distill the core data:<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">text<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>\"Summarize the following project ideation into a dense, token-efficient list \nof core decisions, requirements, and state variables. Avoid conversational filler.\"\n<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">Take that resulting summary, inject it into the conversation as a single <code>system<\/code> or <code>user<\/code> memory anchor, and discard the raw text of those old messages.<\/p>\n\n\n<style>.wp-block-kadence-advancedheading.kt-adv-heading521354_1394da-c6, .wp-block-kadence-advancedheading.kt-adv-heading521354_1394da-c6[data-kb-block=\"kb-adv-heading521354_1394da-c6\"]{font-style:normal;}.wp-block-kadence-advancedheading.kt-adv-heading521354_1394da-c6 mark.kt-highlight, .wp-block-kadence-advancedheading.kt-adv-heading521354_1394da-c6[data-kb-block=\"kb-adv-heading521354_1394da-c6\"] mark.kt-highlight{font-style:normal;color:#f76a0c;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding-top:0px;padding-right:0px;padding-bottom:0px;padding-left:0px;}.wp-block-kadence-advancedheading.kt-adv-heading521354_1394da-c6 img.kb-inline-image, .wp-block-kadence-advancedheading.kt-adv-heading521354_1394da-c6[data-kb-block=\"kb-adv-heading521354_1394da-c6\"] img.kb-inline-image{width:150px;vertical-align:baseline;}<\/style>\n<p class=\"kt-adv-heading521354_1394da-c6 wp-block-kadence-advancedheading\" data-kb-block=\"kb-adv-heading521354_1394da-c6\"><strong>3. 
Maintain a Persistent State Object<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">During a long brainstorming session, certain details are set in stone (e.g., a project name, a chosen tech stack, or a specific feature list). Relying on the AI to remember these facts from a message sent 20 turns ago is a recipe for failure.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">Instead, design your application to maintain a persistent <strong>State Object<\/strong> (a simple key-value dictionary) in your backend code.<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>As ideas are approved, extract them from the chat stream.<\/li>\n\n\n\n<li>Inject this structured state directly into the global system prompt on every single turn.<\/li>\n\n\n\n<li>This guarantees the AI always knows the absolute truth of the project parameters without needing to read the entire history to find them.<\/li>\n<\/ul>\n\n\n<style>.wp-block-kadence-advancedheading.kt-adv-heading521354_b2937e-e1, .wp-block-kadence-advancedheading.kt-adv-heading521354_b2937e-e1[data-kb-block=\"kb-adv-heading521354_b2937e-e1\"]{font-style:normal;}.wp-block-kadence-advancedheading.kt-adv-heading521354_b2937e-e1 mark.kt-highlight, .wp-block-kadence-advancedheading.kt-adv-heading521354_b2937e-e1[data-kb-block=\"kb-adv-heading521354_b2937e-e1\"] mark.kt-highlight{font-style:normal;color:#f76a0c;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding-top:0px;padding-right:0px;padding-bottom:0px;padding-left:0px;}.wp-block-kadence-advancedheading.kt-adv-heading521354_b2937e-e1 img.kb-inline-image, .wp-block-kadence-advancedheading.kt-adv-heading521354_b2937e-e1[data-kb-block=\"kb-adv-heading521354_b2937e-e1\"] img.kb-inline-image{width:150px;vertical-align:baseline;}<\/style>\n<p class=\"kt-adv-heading521354_b2937e-e1 wp-block-kadence-advancedheading\" data-kb-block=\"kb-adv-heading521354_b2937e-e1\"><strong>4. 
Force a &#8220;Telegraphic&#8221; Writing Style<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">Outbound tokens count toward your context limit just as much as inbound tokens. If your local model is overly chatty, it is actively eating away its own memory.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">Tighten your base system prompt to enforce strict output constraints:<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>Ban Filler:<\/strong> Explicitly forbid conversational transitions, greetings, and generic confirmations (e.g., <em>&#8220;Sure, I can help with that!&#8221;<\/em>).<\/li>\n\n\n\n<li><strong>Telegraph Style:<\/strong> Command the model to respond in short, nested bullet points rather than dense paragraphs.<\/li>\n<\/ul>\n\n\n\n<h2 class=\"wp-block-heading\">High-Performance Local AI<\/h2>\n\n\n\n<p class=\"wp-block-paragraph\">By moving the burden of memory management from the AI&#8217;s internal attention mechanism to your application&#8217;s code, you get the best of both worlds: the blazing-fast speed of a lightweight local model, and the deep context tracking required for complex, long-form ideation. Building it yourself isn&#8217;t just a great way to learn\u2014it results in a highly optimized, custom tool built exactly for your workflow.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n\n<h2 class=\"wp-block-heading\">Some additional tips: <\/h2>\n\n\n\n<h3 class=\"wp-block-heading\">Enforce &#8220;Compact Response&#8221; System Prompts<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">Smaller 1.2B models can be overly chatty if not strictly constrained. 
Update the <code>system<\/code> role prompt of your custom app to force brevity, which saves inbound and outbound token space: [<a href=\"https:\/\/www.reddit.com\/r\/LocalLLaMA\/comments\/1ekr75a\/why_is_nobody_taking_about_internlm_25_20b\/\">1<\/a>]<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">text<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>You are a compact ideation assistant.\nAdhere strictly to these formatting rules:\n- Speak in telegram style: no filler phrases, greetings, or transitions.\n- Use nested, short bullet points instead of full paragraphs.\n- If a concept is finalized, format it as: &#91;FINALIZED: Concept Name] -&gt; Description.\n<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n\n<h3 class=\"wp-block-heading\">Execute a &#8220;Checkpoint&#8221; Prompt<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">Before your context window fills up completely, run a specific prompt to generate a dense, machine-readable summary.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Copy and paste this prompt into your local AI:<\/strong><\/p>\n\n\n\n<blockquote class=\"wp-block-quote is-layout-flow wp-block-quote-is-layout-flow\">\n<p class=\"wp-block-paragraph\"><em>&#8220;We are running out of context space. Please generate a dense, structured &#8216;Brainstorming Checkpoint Summary&#8217; of our conversation so far. Include: 1) The core objective, 2) Established constraints, 3) Approved ideas\/decisions, 4) Discarded ideas (and why), and 5) The immediate next steps. 
Format it to be as token-efficient as possible for a future prompt.&#8221;<\/em><\/p>\n<\/blockquote>\n\n\n<style>.wp-block-kadence-spacer.kt-block-spacer-521354_66683a-0e .kt-block-spacer{height:92px;}.wp-block-kadence-spacer.kt-block-spacer-521354_66683a-0e .kt-divider{border-top-width:1px;height:1px;border-top-color:#eee;width:80%;border-top-style:solid;}<\/style>\n<div class=\"wp-block-kadence-spacer aligncenter kt-block-spacer-521354_66683a-0e\"><div class=\"kt-block-spacer kt-block-spacer-halign-center\"><hr class=\"kt-divider\"\/><\/div><\/div>\n\n\n<style>.kadence-column521354_2e2d9c-8d > .kt-inside-inner-col,.kadence-column521354_2e2d9c-8d > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column521354_2e2d9c-8d > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column521354_2e2d9c-8d > .kt-inside-inner-col{flex-direction:column;}.kadence-column521354_2e2d9c-8d > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column521354_2e2d9c-8d > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column521354_2e2d9c-8d{position:relative;}@media all and (max-width: 1024px){.kadence-column521354_2e2d9c-8d > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column521354_2e2d9c-8d > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column521354_2e2d9c-8d\"><div class=\"kt-inside-inner-col\"><style>.wp-block-kadence-advancedheading.kt-adv-heading521354_01c32c-7f, .wp-block-kadence-advancedheading.kt-adv-heading521354_01c32c-7f[data-kb-block=\"kb-adv-heading521354_01c32c-7f\"]{padding-top:var(--global-kb-spacing-sm, 1.5rem);padding-right:var(--global-kb-spacing-sm, 1.5rem);padding-bottom:var(--global-kb-spacing-sm, 1.5rem);padding-left:var(--global-kb-spacing-sm, 
1.5rem);font-size:var(--global-kb-font-size-sm, 0.9rem);font-style:normal;}.wp-block-kadence-advancedheading.kt-adv-heading521354_01c32c-7f mark.kt-highlight, .wp-block-kadence-advancedheading.kt-adv-heading521354_01c32c-7f[data-kb-block=\"kb-adv-heading521354_01c32c-7f\"] mark.kt-highlight{font-style:normal;color:#f76a0c;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding-top:0px;padding-right:0px;padding-bottom:0px;padding-left:0px;}.wp-block-kadence-advancedheading.kt-adv-heading521354_01c32c-7f img.kb-inline-image, .wp-block-kadence-advancedheading.kt-adv-heading521354_01c32c-7f[data-kb-block=\"kb-adv-heading521354_01c32c-7f\"] img.kb-inline-image{width:150px;vertical-align:baseline;}<\/style>\n<p class=\"kt-adv-heading521354_01c32c-7f wp-block-kadence-advancedheading has-theme-palette-7-background-color has-background\" data-kb-block=\"kb-adv-heading521354_01c32c-7f\">Technology is evolving so fast that it is creating a significant digital divide, which will accelerate in the near future. The world is transforming fast. I published a new book that sets the stage for this mindset change (not a how-to book but a what-and-why book): <a href=\"https:\/\/jorgep.com\/blog\/book-dont-just-chat-delegate\/\"><em>Don\u2019t Just Chat with AI Delegate<\/em><\/a><em>, part of the series <\/em><a href=\"https:\/\/dontjustchat.com\/\"><em>Don\u2019t Just Chat with AI<\/em><\/a><em>. It&#8217;s available now on Amazon (Kindle and Paperback) and as an Audiobook (Eleven Labs) &#8211; <\/em><a href=\"https:\/\/jorgep.com\/blog\/book-dont-just-chat-delegate\/\"><em>Check out the website<\/em><\/a><em> for links and more info!<\/em><\/p>\n<\/div><\/div>\n","protected":false},"excerpt":{"rendered":"<p>I love to experiment, tear things apart, and figure out exactly how things work under the hood. For me, the best way to truly understand a technology is by doing\u2014which is why I prefer developing my own custom tools rather than just relying on pre-built software. 
While some polished front-end interfaces already have built-in memory&#8230;<\/p>\n","protected":false},"author":2,"featured_media":521356,"comment_status":"closed","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"_kad_blocks_custom_css":"","_kad_blocks_head_custom_js":"","_kad_blocks_body_custom_js":"","_kad_blocks_footer_custom_js":"","ngg_post_thumbnail":0,"episode_type":"","audio_file":"","podmotor_file_id":"","podmotor_episode_id":"","cover_image":"","cover_image_id":"","duration":"","filesize":"","filesize_raw":"","date_recorded":"","explicit":"","block":"","itunes_episode_number":"","itunes_title":"","itunes_season_number":"","itunes_episode_type":"","_kad_post_transparent":"","_kad_post_title":"","_kad_post_layout":"","_kad_post_sidebar_id":"","_kad_post_content_style":"","_kad_post_vertical_padding":"","_kad_post_feature":"","_kad_post_feature_position":"","_kad_post_header":false,"_kad_post_footer":false,"_kad_post_classname":"","footnotes":""},"categories":[1031],"tags":[930],"class_list":["post-521354","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-ai-learnings-series","tag-ai-series"],"taxonomy_info":{"category":[{"value":1031,"label":"AI Learnings Series"}],"post_tag":[{"value":930,"label":"AI Series"}]},"featured_image_src_large":["https:\/\/jorgep.com\/blog\/wp-content\/uploads\/FeaturedSubstack-ContextSmallModels-1024x538.jpg",1024,538,true],"author_info":{"display_name":"Jorge Pereira","author_link":"https:\/\/jorgep.com\/blog\/author\/jorge\/"},"comment_info":0,"category_info":[{"term_id":1031,"name":"AI Learnings Series","slug":"ai-learnings-series","term_group":0,"term_taxonomy_id":1041,"taxonomy":"category","description":"","parent":0,"count":43,"filter":"raw","cat_ID":1031,"category_count":43,"category_description":"","cat_name":"AI Learnings Series","category_nicename":"ai-learnings-series","category_parent":0}],"tag_info":[{"term_id":930,"name":"AI 
Series","slug":"ai-series","term_group":0,"term_taxonomy_id":940,"taxonomy":"post_tag","description":"","parent":0,"count":219,"filter":"raw"}],"_links":{"self":[{"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/posts\/521354","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/users\/2"}],"replies":[{"embeddable":true,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/comments?post=521354"}],"version-history":[{"count":1,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/posts\/521354\/revisions"}],"predecessor-version":[{"id":521355,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/posts\/521354\/revisions\/521355"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/media\/521356"}],"wp:attachment":[{"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/media?parent=521354"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/categories?post=521354"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/tags?post=521354"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}