{"id":521022,"date":"2026-05-31T00:22:24","date_gmt":"2026-05-31T07:22:24","guid":{"rendered":"https:\/\/jorgep.com\/blog\/?p=521022"},"modified":"2026-06-22T08:31:05","modified_gmt":"2026-06-22T15:31:05","slug":"unlocking-the-npu-fastflowlm","status":"publish","type":"post","link":"https:\/\/jorgep.com\/blog\/unlocking-the-npu-fastflowlm\/","title":{"rendered":"Unlocking the NPU: FastFlowLM"},"content":{"rendered":"\n<div class=\"wp-block-columns has-theme-palette-7-background-color has-background is-layout-flex wp-container-core-columns-is-layout-5dc627e1 wp-block-columns-is-layout-flex\" style=\"margin-top:0;margin-bottom:0\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:80%\">\n<p class=\"wp-block-paragraph\">Part of: <strong> <a href=\"https:\/\/jorgep.com\/blog\/series-ai-learnings\/\">AI Learning Series Here<\/a><\/strong><\/p>\n\n\n<style>.kadence-column395113_e6e0a6-a4 > .kt-inside-inner-col,.kadence-column395113_e6e0a6-a4 > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column395113_e6e0a6-a4 > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column395113_e6e0a6-a4 > .kt-inside-inner-col{flex-direction:column;}.kadence-column395113_e6e0a6-a4 > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column395113_e6e0a6-a4 > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column395113_e6e0a6-a4{position:relative;}@media all and (max-width: 1024px){.kadence-column395113_e6e0a6-a4 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column395113_e6e0a6-a4 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column395113_e6e0a6-a4\"><div class=\"kt-inside-inner-col\"><style>.kadence-column510545_f73041-db > 
.kt-inside-inner-col{padding-top:var(--global-kb-spacing-xs, 1rem);padding-bottom:var(--global-kb-spacing-xs, 1rem);}.kadence-column510545_f73041-db > .kt-inside-inner-col,.kadence-column510545_f73041-db > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column510545_f73041-db > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column510545_f73041-db > .kt-inside-inner-col{flex-direction:column;}.kadence-column510545_f73041-db > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column510545_f73041-db > .kt-inside-inner-col{background-color:var(--global-palette7, #EDF2F7);}.kadence-column510545_f73041-db:hover > .kt-inside-inner-col{background-color:var(--global-palette8, #F7FAFC);background-image:none;}.kadence-column510545_f73041-db > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column510545_f73041-db{position:relative;}@media all and (max-width: 1024px){.kadence-column510545_f73041-db > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column510545_f73041-db > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column510545_f73041-db\"><div class=\"kt-inside-inner-col\"><style>.wp-block-kadence-advancedheading.kt-adv-heading510545_c22cc7-a4, .wp-block-kadence-advancedheading.kt-adv-heading510545_c22cc7-a4[data-kb-block=\"kb-adv-heading510545_c22cc7-a4\"]{text-align:center;font-size:var(--global-kb-font-size-sm, 0.9rem);font-style:normal;}.wp-block-kadence-advancedheading.kt-adv-heading510545_c22cc7-a4 mark.kt-highlight, .wp-block-kadence-advancedheading.kt-adv-heading510545_c22cc7-a4[data-kb-block=\"kb-adv-heading510545_c22cc7-a4\"] 
mark.kt-highlight{font-style:normal;color:#f76a0c;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding-top:0px;padding-right:0px;padding-bottom:0px;padding-left:0px;}.wp-block-kadence-advancedheading.kt-adv-heading510545_c22cc7-a4 img.kb-inline-image, .wp-block-kadence-advancedheading.kt-adv-heading510545_c22cc7-a4[data-kb-block=\"kb-adv-heading510545_c22cc7-a4\"] img.kb-inline-image{width:150px;vertical-align:baseline;}<\/style>\n<p class=\"kt-adv-heading510545_c22cc7-a4 wp-block-kadence-advancedheading\" data-kb-block=\"kb-adv-heading510545_c22cc7-a4\">Quick Links:&nbsp;<a href=\"https:\/\/jorgep.com\/blog\/resources-for-learning-ai\/\">Resources for Learning AI<\/a> | <a href=\"https:\/\/jorgep.com\/blog\/keeping-up-with-ai\/\">Keep up with AI<\/a> | <a href=\"https:\/\/jorgep.com\/blog\/list-of-ai-tools\/\" data-type=\"post\" data-id=\"402818\">List of AI Tools<\/a> | <a href=\"https:\/\/jorgep.com\/blog\/local-ai-series\/\" data-type=\"page\" data-id=\"519365\">Local AI<\/a> | <a href=\"https:\/\/jorgep.com\/blog\/tag\/ai-agents\/\" data-type=\"post_tag\" data-id=\"941\">AI Agents<\/a> |  <a href=\"https:\/\/jorgep.com\/blog\/work-beyond-tomorrow-series\/\" data-type=\"page\" data-id=\"365001\">Future of Work<\/a><\/p>\n<\/div><\/div>\n<\/div><\/div>\n\n\n<style>.kb-row-layout-id395113_97845d-28 > .kt-row-column-wrap{align-content:start;}:where(.kb-row-layout-id395113_97845d-28 > .kt-row-column-wrap) > .wp-block-kadence-column{justify-content:start;}.kb-row-layout-id395113_97845d-28 > .kt-row-column-wrap{column-gap:var(--global-kb-gap-md, 2rem);row-gap:var(--global-kb-gap-none, 0rem );padding-top:var(--global-kb-spacing-xxs, 0.5rem);padding-bottom:var(--global-kb-spacing-xxs, 0.5rem);grid-template-columns:repeat(2, minmax(0, 1fr));}.kb-row-layout-id395113_97845d-28 > .kt-row-layout-overlay{opacity:0.30;}@media all and (max-width: 1024px){.kb-row-layout-id395113_97845d-28 > .kt-row-column-wrap{grid-template-columns:repeat(2, minmax(0, 
1fr));}}@media all and (max-width: 767px){.kb-row-layout-id395113_97845d-28 > .kt-row-column-wrap{grid-template-columns:minmax(0, 1fr);}}<\/style><div class=\"kb-row-layout-wrap kb-row-layout-id395113_97845d-28 alignnone wp-block-kadence-rowlayout\"><div class=\"kt-row-column-wrap kt-has-2-columns kt-row-layout-equal kt-tab-layout-inherit kt-mobile-layout-row kt-row-valign-top\">\n<style>.kadence-column395113_fb3852-97 > .kt-inside-inner-col,.kadence-column395113_fb3852-97 > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column395113_fb3852-97 > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column395113_fb3852-97 > .kt-inside-inner-col{flex-direction:column;}.kadence-column395113_fb3852-97 > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column395113_fb3852-97 > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column395113_fb3852-97{position:relative;}@media all and (max-width: 1024px){.kadence-column395113_fb3852-97 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column395113_fb3852-97 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column395113_fb3852-97\"><div class=\"kt-inside-inner-col\"><style>.wp-block-kadence-advancedheading.kt-adv-heading395113_0b34c1-ff, .wp-block-kadence-advancedheading.kt-adv-heading395113_0b34c1-ff[data-kb-block=\"kb-adv-heading395113_0b34c1-ff\"]{text-align:center;font-size:var(--global-kb-font-size-sm, 0.9rem);line-height:60px;font-style:normal;background-color:#f5a511;}.wp-block-kadence-advancedheading.kt-adv-heading395113_0b34c1-ff mark.kt-highlight, .wp-block-kadence-advancedheading.kt-adv-heading395113_0b34c1-ff[data-kb-block=\"kb-adv-heading395113_0b34c1-ff\"] 
mark.kt-highlight{font-style:normal;color:#f76a0c;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding-top:0px;padding-right:0px;padding-bottom:0px;padding-left:0px;}.wp-block-kadence-advancedheading.kt-adv-heading395113_0b34c1-ff img.kb-inline-image, .wp-block-kadence-advancedheading.kt-adv-heading395113_0b34c1-ff[data-kb-block=\"kb-adv-heading395113_0b34c1-ff\"] img.kb-inline-image{width:150px;vertical-align:baseline;}<\/style>\n<p class=\"kt-adv-heading395113_0b34c1-ff wp-block-kadence-advancedheading\" data-kb-block=\"kb-adv-heading395113_0b34c1-ff\">Subscribe to <a href=\"https:\/\/go.35s.be\/jtb\" target=\"_blank\" rel=\"noreferrer noopener\"><strong>JorgeTechBits  newsletter<\/strong><\/a><\/p>\n<\/div><\/div>\n\n\n<style>.kadence-column395113_1641f9-51 > .kt-inside-inner-col,.kadence-column395113_1641f9-51 > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column395113_1641f9-51 > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column395113_1641f9-51 > .kt-inside-inner-col{flex-direction:column;}.kadence-column395113_1641f9-51 > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column395113_1641f9-51 > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column395113_1641f9-51{position:relative;}@media all and (max-width: 1024px){.kadence-column395113_1641f9-51 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column395113_1641f9-51 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column395113_1641f9-51\"><div class=\"kt-inside-inner-col\"><style>.wp-block-kadence-advancedheading.kt-adv-heading395113_63dab8-35, 
.wp-block-kadence-advancedheading.kt-adv-heading395113_63dab8-35[data-kb-block=\"kb-adv-heading395113_63dab8-35\"]{text-align:center;font-size:var(--global-kb-font-size-sm, 0.9rem);font-style:normal;}.wp-block-kadence-advancedheading.kt-adv-heading395113_63dab8-35 mark.kt-highlight, .wp-block-kadence-advancedheading.kt-adv-heading395113_63dab8-35[data-kb-block=\"kb-adv-heading395113_63dab8-35\"] mark.kt-highlight{font-style:normal;color:#f76a0c;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding-top:0px;padding-right:0px;padding-bottom:0px;padding-left:0px;}.wp-block-kadence-advancedheading.kt-adv-heading395113_63dab8-35 img.kb-inline-image, .wp-block-kadence-advancedheading.kt-adv-heading395113_63dab8-35[data-kb-block=\"kb-adv-heading395113_63dab8-35\"] img.kb-inline-image{width:150px;vertical-align:baseline;}<\/style>\n<p class=\"kt-adv-heading395113_63dab8-35 wp-block-kadence-advancedheading\" data-kb-block=\"kb-adv-heading395113_63dab8-35\">Explore the <a href=\"https:\/\/jorgep.com\/blog\/latest-token-prices\/\" data-type=\"page\" data-id=\"521255\">Latest Token Prices<\/a><\/p>\n<\/div><\/div>\n\n<\/div><\/div><\/div>\n\n\n\n<div class=\"wp-block-column is-vertically-aligned-top is-layout-flow wp-block-column-is-layout-flow\"><div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><a href=\"https:\/\/jorgep.com\/blog\/book-dont-just-chat-delegate\/\"><img loading=\"lazy\" decoding=\"async\" width=\"640\" height=\"1024\" src=\"https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-640x1024.jpg\" alt=\"\" class=\"wp-image-520234\" style=\"aspect-ratio:0.6250142320391666;width:98px;height:auto\" srcset=\"https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-640x1024.jpg 640w, https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-188x300.jpg 188w, https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-768x1229.jpg 768w, 
https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-960x1536.jpg 960w, https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-1280x2048.jpg 1280w, https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01.jpg 1600w\" sizes=\"auto, (max-width: 640px) 100vw, 640px\" \/><\/a><figcaption class=\"wp-element-caption\"><a href=\"https:\/\/jorgep.com\/blog\/book-series-ai-dont-just-chat\/\" data-type=\"page\" data-id=\"520242\">Check out the Book Series<\/a><\/figcaption><\/figure>\n<\/div><\/div>\n<\/div>\n\n\n<style>.wp-block-kadence-advancedheading.kt-adv-heading519190_b33a00-c9, .wp-block-kadence-advancedheading.kt-adv-heading519190_b33a00-c9[data-kb-block=\"kb-adv-heading519190_b33a00-c9\"]{font-size:var(--global-kb-font-size-sm, 0.9rem);font-style:normal;}.wp-block-kadence-advancedheading.kt-adv-heading519190_b33a00-c9 mark.kt-highlight, .wp-block-kadence-advancedheading.kt-adv-heading519190_b33a00-c9[data-kb-block=\"kb-adv-heading519190_b33a00-c9\"] mark.kt-highlight{font-style:normal;color:#f76a0c;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding-top:0px;padding-right:0px;padding-bottom:0px;padding-left:0px;}.wp-block-kadence-advancedheading.kt-adv-heading519190_b33a00-c9 img.kb-inline-image, .wp-block-kadence-advancedheading.kt-adv-heading519190_b33a00-c9[data-kb-block=\"kb-adv-heading519190_b33a00-c9\"] img.kb-inline-image{width:150px;vertical-align:baseline;}<\/style>\n<p class=\"kt-adv-heading519190_b33a00-c9 wp-block-kadence-advancedheading\" data-kb-block=\"kb-adv-heading519190_b33a00-c9\"><strong>Disclaimer:<\/strong> <strong>I create this content entirely on my own time, and the views expressed here are mine alone (not my employer&#8217;s)<\/strong>. 
Because I love leveraging new tech, I use AI tools like Gemini, NotebookLM, Claude, Perplexity and others as a &#8220;digital team&#8221; to help research and polish these articles so I can share the best possible insights with you!<\/p>\n\n\n\n<h4 class=\"wp-block-heading\"><em>How I Bypassed Ollama and LM Studio Limitations on my Ryzen AI NPU to Hit 50+ TPS <\/em><\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">If you recently purchased a modern AI-PC, you bought into a promising vision: a dedicated, cutting-edge Neural Processing Unit (NPU) sitting right inside your silicon, designed to stream large language models (LLMs) smoothly without draining your battery or spinning up your GPU fans. But if you are like me, your initial attempt to tap into that hardware felt like hitting a brick wall.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">I started where most enthusiasts do: downloading industry staples like <strong>Ollama<\/strong> and <strong>LM Studio<\/strong>. I assumed they would automatically detect the specialized NPU. Instead, they ignored it entirely. Because these tools are architected for general-purpose GPU execution (CUDA\/ROCm), my state-of-the-art Ryzen AI chip was sidelined. The system fell back to CPU-only execution, yielding a sluggish, disappointing <strong>3 to 10 tokens per second (TPS)<\/strong>. 
It was inefficient and left a massive piece of my processor sitting idle.<\/p>\n\n\n<style>.wp-block-kadence-advancedheading.kt-adv-heading521022_36870a-0e, .wp-block-kadence-advancedheading.kt-adv-heading521022_36870a-0e[data-kb-block=\"kb-adv-heading521022_36870a-0e\"]{font-size:var(--global-kb-font-size-md, 1.25rem);font-style:normal;}.wp-block-kadence-advancedheading.kt-adv-heading521022_36870a-0e mark.kt-highlight, .wp-block-kadence-advancedheading.kt-adv-heading521022_36870a-0e[data-kb-block=\"kb-adv-heading521022_36870a-0e\"] mark.kt-highlight{font-style:normal;color:#f76a0c;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding-top:0px;padding-right:0px;padding-bottom:0px;padding-left:0px;}.wp-block-kadence-advancedheading.kt-adv-heading521022_36870a-0e img.kb-inline-image, .wp-block-kadence-advancedheading.kt-adv-heading521022_36870a-0e[data-kb-block=\"kb-adv-heading521022_36870a-0e\"] img.kb-inline-image{width:150px;vertical-align:baseline;}<\/style>\n<p class=\"kt-adv-heading521022_36870a-0e wp-block-kadence-advancedheading\" data-kb-block=\"kb-adv-heading521022_36870a-0e\">Then, I discovered <strong><a href=\"https:\/\/fastflowlm.com\/\">FastFlowLM (FLM)<\/a><\/strong>. (Video at the end of this blog post) <\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n\n<h3 class=\"wp-block-heading\">The Breakthrough: Native Hardware Optimization<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">FLM is built from the ground up to execute models natively on XDNA-based architectures. By leveraging AMD\u2019s specialized kernel drivers, it bypasses standard GPU-centric abstraction layers to communicate directly with the NPU silicon.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">The difference was staggering. The moment I initiated the FLM environment, it claimed exclusive, low-overhead access to the NPU. 
My standard daily-driver models jumped from single-digit speeds to a consistent <strong>22 to 37 TPS<\/strong>. The generation was fluid, instant, and consumed virtually no background GPU resources. When I switched to edge-optimized small language models\u2014specifically Meta\u2019s <code>llama3.2:1b<\/code>\u2014the performance cracked an unbelievable <strong>50+ TPS<\/strong>.<\/p>\n\n\n\n<figure class=\"wp-block-table\"><table class=\"has-fixed-layout\"><thead><tr><th class=\"has-text-align-left\" data-align=\"left\">Inference Engine<\/th><th class=\"has-text-align-left\" data-align=\"left\">Target Hardware Block<\/th><th class=\"has-text-align-left\" data-align=\"left\">Avg. Performance<\/th><th class=\"has-text-align-left\" data-align=\"left\">User Experience<\/th><\/tr><\/thead><tbody><tr><td class=\"has-text-align-left\" data-align=\"left\">Ollama \/ LM Studio<\/td><td class=\"has-text-align-left\" data-align=\"left\">CPU (Fallback)<\/td><td class=\"has-text-align-left\" data-align=\"left\">3 \u2013 10 TPS<\/td><td class=\"has-text-align-left\" data-align=\"left\">Sluggish, high overhead<\/td><\/tr><tr><td class=\"has-text-align-left\" data-align=\"left\">FastFlowLM (FLM)<\/td><td class=\"has-text-align-left\" data-align=\"left\">Native Ryzen NPU<\/td><td class=\"has-text-align-left\" data-align=\"left\">22 \u2013 37 TPS<\/td><td class=\"has-text-align-left\" data-align=\"left\">Fluid, immediate<\/td><\/tr><tr><td class=\"has-text-align-left\" data-align=\"left\"><strong>FLM (Llama 3.2 1B)<\/strong><\/td><td class=\"has-text-align-left\" data-align=\"left\"><strong>Native Ryzen NPU<\/strong><\/td><td class=\"has-text-align-left\" data-align=\"left\"><strong>50+ TPS<\/strong><\/td><td class=\"has-text-align-left\" data-align=\"left\"><strong>Blindingly fast<\/strong><\/td><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<h3 class=\"wp-block-heading\">Setting Up Your Environment<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">Configuration I used: <\/p>\n\n\n\n<ul 
class=\"wp-block-list\">\n<li>FastFlowLM on the host machine &#8212; (the only one as of now to utilize the NPU) \n<ul class=\"wp-block-list\">\n<li>Unless you changed the default, models will go into c:\\users\\&lt;user&gt;\\.flm\\models<\/li>\n\n\n\n<li>Started with llama3.2:1b but later added llama3.2:3b, gemma4, and qwen3.5:4B, and they worked GREAT! <\/li>\n<\/ul>\n<\/li>\n\n\n\n<li><\/li>\n\n\n\n<li>Docker (because I like to keep my machine clean, and can configure many items virtually)\n<ul class=\"wp-block-list\">\n<li>Running OpenWebUI (see yml file below) for testing<\/li>\n\n\n\n<li>Added the local LLM endpoint to my <a href=\"https:\/\/35sites.com\/applications\/unified-aichat-hub\/\">Unified AI Chat Hub Application<\/a> (as per below)<\/li>\n\n\n\n<li><\/li>\n<\/ul>\n<\/li>\n<\/ul>\n\n\n\n<p class=\"wp-block-paragraph\">To get started, <\/p>\n\n\n\n<ol class=\"wp-block-list\">\n<li>you must first install the <strong>FLM Application<\/strong> directly on your Windows or Linux OS (Check out their <a href=\"https:\/\/github.com\/julienM77\/FastFlowLM\">GitHub repository<\/a>). This binary acts as the bridge between your model files and the NPU silicon.\n<ul class=\"wp-block-list\">\n<li>Start FLM from the terminal using the following command:<\/li>\n\n\n\n<li><\/li>\n<\/ul>\n<\/li>\n\n\n\n<li><strong>flm serve llama3.2:1b --host 0.0.0.0<\/strong>\n<ul class=\"wp-block-list\">\n<li><\/li>\n\n\n\n<li>Note: If you use the standard command, flm serve llama3.2:1b, and do not add the --host 0.0.0.0 flag, it will only bind itself to 127.0.0.1 (localhost), which means you will not be able to access it from other computers.<\/li>\n<\/ul>\n<\/li>\n\n\n\n<li>You are not able to interact with llama3.2:1b on the terminal. If you open another terminal session and type flm list, you will see all of the models available (a lot!)   
for your machine.<\/li>\n\n\n\n<li>While FLM runs well in a terminal, most users prefer a GUI to display the server status and to interface with the chat \n<ul class=\"wp-block-list\">\n<li>For a display GUI of the server (where you can start\/stop, configure the server, and also show NPU usage), there is a tool called <a href=\"https:\/\/github.com\/julienM77\/flm-companion\">FLM Companion<\/a> (highly recommended!)<\/li>\n\n\n\n<li>For a front-end chatbot, I integrated my FLM instance into a containerized workflow using <strong>OpenWebUI<\/strong> and <strong>n8n<\/strong>. Because OpenWebUI expects an OpenAI-compliant API, you can point it directly to your local FLM serving port.<\/li>\n<\/ul>\n<\/li>\n<\/ol>\n\n\n\n<p class=\"wp-block-paragraph\">If you are using Docker, configure your <code>docker-compose.yml<\/code> like this:<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>services:\n  open-webui:\n    image: ghcr.io\/open-webui\/open-webui:main\n    container_name: open-webui\n    ports:\n      - \"3000:8080\"\n    environment:\n      - ENABLE_OLLAMA_API=False\n      - OPENAI_API_BASE_URL=http:\/\/host.docker.internal:52625\/v1\n      - OPENAI_API_KEY=flm\n    extra_hosts:\n      - \"host.docker.internal:host-gateway\"<\/code><\/pre>\n\n\n\n<blockquote class=\"wp-block-quote is-layout-flow wp-block-quote-is-layout-flow\">\n<p class=\"wp-block-paragraph\"><strong>\u26a0\ufe0f Crucial Note:<\/strong> Because Docker containers operate in an isolated network, routing is key. 
<code>ENABLE_OLLAMA_API=False<\/code> prevents connection timeouts, while <code>host.docker.internal<\/code> (and the <code>host-gateway<\/code> flag) maps the container bridge back to your host machine&#8217;s FLM port.<\/p>\n<\/blockquote>\n\n\n\n<p class=\"wp-block-paragraph\">Alternatively, if you have a different chatbot interface (like my own, <a href=\"https:\/\/35sites.com\/applications\/unified-aichat-hub\/\">Unified AI Chat Hub<\/a>), use: <\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>http:&#47;&#47;&lt;your.IP.ADDRESS&gt;:52625\/v1\/chat\/completions<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n\n<h3 class=\"wp-block-heading\">Analyzing Real-Time Execution<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">When a request hits your stack, the handoff is immediate. The execution logs confirm the NPU engagement:<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>&#91;\ud83d\udd17 ] TCP connection established - Remote: 127.0.0.1:61894\n&#91;\ud83d\udfe2 ] NPU Locked!\n&#91;FLM] Model: gemma4-it:e4b\n&#91;FLM] Start prefill... (14 tokens)\n&#91;FLM] Start generating...<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\">The <code>[\ud83d\udfe2 ] NPU Locked!<\/code> status indicates that FLM has successfully bypassed the CPU\/GPU bottleneck and established a direct hardware lock. 
The model prefill phase caches your context into high-speed memory, allowing subsequent inference to leverage the NPU for near-instant response times.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n\n<h3 class=\"wp-block-heading\">Quick Video I created here: <\/h3>\n\n\n<style>.kb-row-layout-id521022_f9a805-9f > .kt-row-column-wrap{align-content:start;}:where(.kb-row-layout-id521022_f9a805-9f > .kt-row-column-wrap) > .wp-block-kadence-column{justify-content:start;}.kb-row-layout-id521022_f9a805-9f > .kt-row-column-wrap{column-gap:var(--global-kb-gap-md, 2rem);row-gap:var(--global-kb-gap-md, 2rem);padding-top:var(--global-kb-spacing-sm, 1.5rem);padding-bottom:var(--global-kb-spacing-sm, 1.5rem);}.kb-row-layout-id521022_f9a805-9f > .kt-row-column-wrap > div:not(.added-for-specificity){grid-column:initial;}.kb-row-layout-id521022_f9a805-9f > .kt-row-column-wrap{grid-template-columns:minmax(0, 1fr) minmax(0, 1fr) minmax(0, 2fr);}.kb-row-layout-id521022_f9a805-9f > .kt-row-layout-overlay{opacity:0.30;}@media all and (max-width: 1024px){.kb-row-layout-id521022_f9a805-9f > .kt-row-column-wrap > div:not(.added-for-specificity){grid-column:initial;}}@media all and (max-width: 1024px){.kb-row-layout-id521022_f9a805-9f > .kt-row-column-wrap{grid-template-columns:minmax(0, 1fr) minmax(0, 1fr) minmax(0, 2fr);}}@media all and (max-width: 767px){.kb-row-layout-id521022_f9a805-9f > .kt-row-column-wrap > div:not(.added-for-specificity){grid-column:initial;}.kb-row-layout-id521022_f9a805-9f > .kt-row-column-wrap{grid-template-columns:minmax(0, 1fr);}}<\/style><div class=\"kb-row-layout-wrap kb-row-layout-id521022_f9a805-9f alignnone wp-block-kadence-rowlayout\"><div class=\"kt-row-column-wrap kt-has-3-columns kt-row-layout-right-half kt-tab-layout-inherit kt-mobile-layout-row kt-row-valign-top\">\n<style>.kadence-column521022_93eeb8-85 > .kt-inside-inner-col,.kadence-column521022_93eeb8-85 > 
.kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column521022_93eeb8-85 > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column521022_93eeb8-85 > .kt-inside-inner-col{flex-direction:column;}.kadence-column521022_93eeb8-85 > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column521022_93eeb8-85 > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column521022_93eeb8-85{position:relative;}@media all and (max-width: 1024px){.kadence-column521022_93eeb8-85 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column521022_93eeb8-85 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column521022_93eeb8-85\"><div class=\"kt-inside-inner-col\"><\/div><\/div>\n\n\n<style>.kadence-column521022_d91ef4-93 > .kt-inside-inner-col,.kadence-column521022_d91ef4-93 > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column521022_d91ef4-93 > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column521022_d91ef4-93 > .kt-inside-inner-col{flex-direction:column;}.kadence-column521022_d91ef4-93 > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column521022_d91ef4-93 > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column521022_d91ef4-93{position:relative;}@media all and (max-width: 1024px){.kadence-column521022_d91ef4-93 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column521022_d91ef4-93 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column521022_d91ef4-93\"><div 
class=\"kt-inside-inner-col\"><\/div><\/div>\n\n\n<style>.kadence-column521022_ef3bc7-cd > .kt-inside-inner-col,.kadence-column521022_ef3bc7-cd > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column521022_ef3bc7-cd > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column521022_ef3bc7-cd > .kt-inside-inner-col{flex-direction:column;}.kadence-column521022_ef3bc7-cd > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column521022_ef3bc7-cd > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column521022_ef3bc7-cd{position:relative;}@media all and (max-width: 1024px){.kadence-column521022_ef3bc7-cd > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column521022_ef3bc7-cd > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column521022_ef3bc7-cd\"><div class=\"kt-inside-inner-col\">\n<figure class=\"wp-block-embed is-type-video is-provider-youtube wp-block-embed-youtube wp-embed-aspect-16-9 wp-has-aspect-ratio\"><div class=\"wp-block-embed__wrapper\">\n<iframe loading=\"lazy\" title=\"Unlock your NPUs -- with FastFlowLM\" width=\"720\" height=\"405\" src=\"https:\/\/www.youtube.com\/embed\/lr_DoG39BVI?list=PL7ub8aDdkaOUv08PWbGgI1XzqpOYBMhaF\" frameborder=\"0\" allow=\"accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share\" referrerpolicy=\"strict-origin-when-cross-origin\" allowfullscreen><\/iframe>\n<\/div><figcaption class=\"wp-element-caption\"><a href=\"https:\/\/www.youtube.com\/watch?v=lr_DoG39BVI&amp;list=PL7ub8aDdkaOUv08PWbGgI1XzqpOYBMhaF\">Watch Video in Playlist<\/a><\/figcaption><\/figure>\n<\/div><\/div>\n\n<\/div><\/div>\n\n\n<h3 class=\"wp-block-heading\">The Takeaway<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">NPUs represent a paradigm 
shift for local AI, but hardware is only as good as the software stack running on it. If you are stuck using legacy, GPU-dependent tools for your Ryzen AI silicon, you are leaving massive performance gains on the table. Moving to a native engine like FLM transforms your hardware from an underutilized component into a highly responsive, ultra-fast localized AI powerhouse.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">Update: As I researched and read more about these guys, I noticed that FLM is now embedded into <a href=\"https:\/\/lemonade-server.ai\/\">Lemonade: Local AI<\/a>.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n<style>.wp-block-kadence-advancedheading.kt-adv-heading407818_afcbba-c7, .wp-block-kadence-advancedheading.kt-adv-heading407818_afcbba-c7[data-kb-block=\"kb-adv-heading407818_afcbba-c7\"]{font-size:var(--global-kb-font-size-sm, 0.9rem);font-style:normal;}.wp-block-kadence-advancedheading.kt-adv-heading407818_afcbba-c7 mark.kt-highlight, .wp-block-kadence-advancedheading.kt-adv-heading407818_afcbba-c7[data-kb-block=\"kb-adv-heading407818_afcbba-c7\"] mark.kt-highlight{font-style:normal;color:#f76a0c;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding-top:0px;padding-right:0px;padding-bottom:0px;padding-left:0px;}.wp-block-kadence-advancedheading.kt-adv-heading407818_afcbba-c7 img.kb-inline-image, .wp-block-kadence-advancedheading.kt-adv-heading407818_afcbba-c7[data-kb-block=\"kb-adv-heading407818_afcbba-c7\"] img.kb-inline-image{width:150px;vertical-align:baseline;}<\/style>\n<p class=\"kt-adv-heading407818_afcbba-c7 wp-block-kadence-advancedheading\" data-kb-block=\"kb-adv-heading407818_afcbba-c7\">Have questions, ideas to share, or just want to connect? I\u2019d love to hear from you! 
Check out my <a href=\"https:\/\/jorgep.com\/blog\/about\/\">About Page<\/a> to learn more about me or connect with me.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><a href=\"https:\/\/fastflowlm.com\/\">FastFlowLM \u00b7 FastFlowLM<\/a><\/li>\n\n\n\n<li><\/li>\n\n\n\n<li><strong><a href=\"https:\/\/jorgep.com\/blog\/category\/ai\/\" target=\"_blank\" rel=\"noreferrer noopener\">Jorge&#8217;s Local AI Series<\/a><\/strong>  |  <strong><a href=\"https:\/\/jorgep.com\/blog\/book-series-ai-dont-just-chat\/\" target=\"_blank\" rel=\"noreferrer noopener\">AI Learnings Series<\/a><\/strong> <\/li>\n\n\n\n<li><strong><a href=\"https:\/\/ai.google.dev\/gemma\" target=\"_blank\" rel=\"noreferrer noopener\">Google Gemma 4 Technical Report<\/a><\/strong><\/li>\n\n\n\n<li><strong><a href=\"https:\/\/developer.nvidia.com\/nemotron\" target=\"_blank\" rel=\"noreferrer noopener\">NVIDIA Nemotron Nano 3 Omni Documentation<\/a><\/strong><\/li>\n\n\n\n<li><\/li>\n<\/ul>\n\n\n\n<h2 class=\"wp-block-heading\">Additional Analysis <\/h2>\n\n\n\n<p class=\"wp-block-paragraph\">After I ran the benchmark command: <\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>flm bench llama3.2:1b<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\">I received: <\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" decoding=\"async\" width=\"926\" height=\"228\" src=\"https:\/\/jorgep.com\/blog\/wp-content\/uploads\/image-168.png\" alt=\"\" class=\"wp-image-521053\" srcset=\"https:\/\/jorgep.com\/blog\/wp-content\/uploads\/image-168.png 926w, https:\/\/jorgep.com\/blog\/wp-content\/uploads\/image-168-300x74.png 300w, https:\/\/jorgep.com\/blog\/wp-content\/uploads\/image-168-768x189.png 768w\" sizes=\"auto, (max-width: 926px) 100vw, 926px\" \/><\/figure>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n\n<h3 
class=\"wp-block-heading\">Understanding the Terminology<\/h3>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>Context Length:<\/strong> This refers to the amount of text (measured in tokens, where 1,000 tokens \u2248 750 words) that the AI can &#8220;remember&#8221; or process at one time. Higher is better, but it is more demanding on hardware.<\/li>\n\n\n\n<li><strong>TTFT (Time to First Token):<\/strong> This is the latency, or the delay, before the AI starts generating the first word of your answer. <strong>Lower is better.<\/strong> You want this to be fast so the AI feels responsive.<\/li>\n\n\n\n<li><strong>Prefill Speed:<\/strong> This measures how quickly the AI &#8220;reads&#8221; or understands the input you provided. <strong>Higher is better.<\/strong><\/li>\n\n\n\n<li><strong>Decoding Speed:<\/strong> This is the rate at which the AI generates the text you see on the screen. <strong>Higher is better.<\/strong> Generally, speeds above 30\u201340 tokens per second (tok\/s) are considered excellent, as they are faster than a human can read.<\/li>\n<\/ul>\n\n\n\n<h3 class=\"wp-block-heading\">Analysis of the Results<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">Looking at the data in the <strong>benchmark screenshot above<\/strong>:<\/p>\n\n\n\n<figure class=\"wp-block-table\"><table><thead><tr><td><strong>Context Length<\/strong><\/td><td><strong>TTFT (s)<\/strong><\/td><td><strong>Decoding Speed (tok\/s)<\/strong><\/td><\/tr><\/thead><tbody><tr><td><strong>1k<\/strong><\/td><td>0.75s<\/td><td>~53 tok\/s<\/td><\/tr><tr><td><strong>8k<\/strong><\/td><td>4.16s<\/td><td>~41 tok\/s<\/td><\/tr><tr><td><strong>32k<\/strong><\/td><td>29.15s<\/td><td>~23 tok\/s<\/td><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>For short conversations (1k\u20134k context):<\/strong> The performance is <strong>very good<\/strong>. 
A TTFT of under 2 seconds and a decoding speed above 45 tok\/s will provide a smooth, snappy experience for daily tasks like drafting emails or simple queries.<\/li>\n\n\n\n<li><strong>For long documents\/complex tasks (16k\u201332k context):<\/strong> The performance drops significantly. A 29-second wait for the first response (TTFT) is quite long, and a decoding speed of 23 tok\/s is slower, though still readable.<\/li>\n<\/ul>\n\n\n\n<h3 class=\"wp-block-heading\">Is this &#8220;Good&#8221; for an AI PC?<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">Yes, this is <strong>solid performance for a local AI PC<\/strong>.<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>Strengths:<\/strong> At typical conversation lengths (up to 8k tokens), this system performs very well and would feel fast to a user.<\/li>\n\n\n\n<li><strong>Limitations:<\/strong> The system struggles as you push toward very long context lengths (32k), likely due to the hardware&#8217;s memory (VRAM) limits.<\/li>\n<\/ul>\n\n\n\n<p class=\"wp-block-paragraph\">If your primary goal is to run a local chatbot for day-to-day writing and analysis, this hardware is doing a great job. If you intend to use it for analyzing massive technical manuals or extremely long documents (32k+ tokens), you might notice it becoming sluggish.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">It really depends on the model you use!   
Look at another one I used: <\/p>\n\n\n\n<h3 class=\"wp-block-heading\">And this one using gemma4<\/h3>\n\n\n\n<pre class=\"wp-block-code\"><code>flm bench gemma4-it:e4b<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" decoding=\"async\" width=\"926\" height=\"225\" src=\"https:\/\/jorgep.com\/blog\/wp-content\/uploads\/image-169.png\" alt=\"\" class=\"wp-image-521055\" srcset=\"https:\/\/jorgep.com\/blog\/wp-content\/uploads\/image-169.png 926w, https:\/\/jorgep.com\/blog\/wp-content\/uploads\/image-169-300x73.png 300w, https:\/\/jorgep.com\/blog\/wp-content\/uploads\/image-169-768x187.png 768w\" sizes=\"auto, (max-width: 926px) 100vw, 926px\" \/><\/figure>\n\n\n\n<h4 class=\"wp-block-heading\">Summary of Performance<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">This data confirms that your local-first AI infrastructure is highly optimized for stability rather than just raw &#8220;burst&#8221; speed.<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>Responsiveness:<\/strong> For typical interactions (1k to 4k), your setup is fast enough for fluid, real-time conversation.<\/li>\n\n\n\n<li><strong>Long-Context Capacity:<\/strong> By maintaining a decoding speed of ~6.5 tok\/s at 32k, your hardware is successfully avoiding the &#8220;total stall&#8221; that often happens when local systems run out of memory or compute bandwidth.<\/li>\n<\/ul>\n\n\n\n<p class=\"wp-block-paragraph\">Overall, this is a very capable profile for a local AI workstation, especially if your workflows involve document analysis or long-form creative tasks where consistent generation is more important than immediate, sub-second responses.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n\n<h2 class=\"wp-block-heading\">One last thing: Hybrid Model GPU and NPU<\/h2>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">I will 
write more about this in the future, but based on the design of the FastFlowLM (FLM) runtime, it is currently not possible to configure it to use both the NPU and the GPU simultaneously for the same model inference task.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">FastFlowLM is intentionally built as an <strong>NPU-first<\/strong> runtime. The development team has focused on optimizing the inference pipeline specifically for the unique architecture of AMD\u2019s Ryzen AI NPUs (the XDNA\/XDNA2 tile-based architecture).<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>Sequential Bottleneck:<\/strong> LLM inference is a sequential process where each layer of the model must finish calculating before the next layer can begin. Because of this, trying to split the workload across two completely different types of compute units (like an NPU and a GPU) would introduce massive synchronization overhead, likely making the system significantly slower than using just one of them.<\/li>\n\n\n\n<li><strong>Specialized Kernels:<\/strong> FLM uses custom, highly optimized kernels specifically written to communicate with the NPU hardware. These are not designed to offload sub-tasks to the GPU.<\/li>\n<\/ul>\n\n\n\n<h3 class=\"wp-block-heading\">Why You Can&#8217;t Simply &#8220;Turn On&#8221; Both<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">As discussed, the <strong>FastFlowLM (FLM)<\/strong> runtime is specifically architected to interact with the unique hardware of your NPU. Integrating both is not a simple settings toggle because of how LLM processing works:<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong>Sequential Bottlenecks:<\/strong> LLMs generate text token by token. Once the model is &#8220;prefilled&#8221; (it has read your prompt), the generation process is mostly limited by memory bandwidth. 
Switching the task between two different architectures (NPU and GPU) requires moving data across the system bus, which often introduces more delay than the speed gained by the parallel hardware.<\/li>\n\n\n\n<li><strong>Architectural Differences:<\/strong> NPUs and GPUs use different data types and memory access patterns. Making them work in &#8220;lock-step&#8221; requires complex software orchestration that is currently not supported in standard end-user tools like FLM.<\/li>\n<\/ul>\n\n\n\n<h3 class=\"wp-block-heading\">The &#8220;Hybrid&#8221; Future<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">The industry is moving toward what is called <strong>Disaggregated Inference<\/strong>. In this setup, systems are designed to intelligently split the workload:<\/p>\n\n\n\n<ol start=\"1\" class=\"wp-block-list\">\n<li><strong>NPU handles Prefill:<\/strong> It is often excellent at processing the input text (prefill) quickly and efficiently.<\/li>\n\n\n\n<li><strong>GPU handles Decode:<\/strong> It excels at the &#8220;decoding&#8221; loop\u2014generating one token at a time\u2014due to its high memory bandwidth.<\/li>\n<\/ol>\n\n\n\n<p class=\"wp-block-paragraph\">More on this in a future blog post for sure! 
<\/p>\n\n\n<style>.wp-block-kadence-advancedheading.kt-adv-heading519075_3e93d8-da, .wp-block-kadence-advancedheading.kt-adv-heading519075_3e93d8-da[data-kb-block=\"kb-adv-heading519075_3e93d8-da\"]{text-align:center;font-size:var(--global-kb-font-size-md, 1.25rem);line-height:60px;font-style:normal;background-color:#f5a511;}.wp-block-kadence-advancedheading.kt-adv-heading519075_3e93d8-da mark.kt-highlight, .wp-block-kadence-advancedheading.kt-adv-heading519075_3e93d8-da[data-kb-block=\"kb-adv-heading519075_3e93d8-da\"] mark.kt-highlight{font-style:normal;color:#f76a0c;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding-top:0px;padding-right:0px;padding-bottom:0px;padding-left:0px;}.wp-block-kadence-advancedheading.kt-adv-heading519075_3e93d8-da img.kb-inline-image, .wp-block-kadence-advancedheading.kt-adv-heading519075_3e93d8-da[data-kb-block=\"kb-adv-heading519075_3e93d8-da\"] img.kb-inline-image{width:150px;vertical-align:baseline;}<\/style>\n<p class=\"kt-adv-heading519075_3e93d8-da wp-block-kadence-advancedheading\" data-kb-block=\"kb-adv-heading519075_3e93d8-da\">Want More Updates? 
=>Subscribe to my <a href=\"https:\/\/go.35s.be\/jtb\" target=\"_blank\" rel=\"noreferrer noopener\"><strong>JorgeTechBits  newsletter<\/strong><\/a><\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n","protected":false},"excerpt":{"rendered":"<p>How I Bypassed Ollama and LM studio Limitations on my Ryzen AI NPU to Hit 50+ TPS If you recently purchased a modern AI-PC, you bought into a promising vision: a dedicated, cutting-edge Neural Processing Unit (NPU) sitting right inside your silicon, designed to stream large language models (LLMs) smoothly without draining your battery or&#8230;<\/p>\n","protected":false},"author":2,"featured_media":521050,"comment_status":"closed","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"_kad_blocks_custom_css":"","_kad_blocks_head_custom_js":"","_kad_blocks_body_custom_js":"","_kad_blocks_footer_custom_js":"","ngg_post_thumbnail":0,"episode_type":"","audio_file":"","podmotor_file_id":"","podmotor_episode_id":"","cover_image":"","cover_image_id":"","duration":"","filesize":"","filesize_raw":"","date_recorded":"","explicit":"","block":"","itunes_episode_number":"","itunes_title":"","itunes_season_number":"","itunes_episode_type":"","_kad_post_transparent":"","_kad_post_title":"","_kad_post_layout":"","_kad_post_sidebar_id":"","_kad_post_content_style":"","_kad_post_vertical_padding":"","_kad_post_feature":"","_kad_post_feature_position":"","_kad_post_header":false,"_kad_post_footer":false,"_kad_post_classname":"","footnotes":""},"categories":[1031,441],"tags":[930,1060,894,1053,876,986,1061],"class_list":["post-521022","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-ai-learnings-series","category-tech-talk","tag-ai-series","tag-amd","tag-artificial-intelligence","tag-gemma4","tag-llm","tag-local-ai","tag-nvidia"],"taxonomy_info":{"category":[{"value":1031,"label":"AI Learnings Series"},{"value":441,"label":"Tech Talk"}],"post_tag":[{"value":930,"label":"AI 
Series"},{"value":1060,"label":"AMD"},{"value":894,"label":"artificial intelligence"},{"value":1053,"label":"gemma4"},{"value":876,"label":"LLM"},{"value":986,"label":"Local AI"},{"value":1061,"label":"NVIDIA"}]},"featured_image_src_large":["https:\/\/jorgep.com\/blog\/wp-content\/uploads\/jorgep-FeaturedImage-LocalAI-NPUUnlocked.jpg",800,300,false],"author_info":{"display_name":"Jorge Pereira","author_link":"https:\/\/jorgep.com\/blog\/author\/jorge\/"},"comment_info":0,"category_info":[{"term_id":1031,"name":"AI Learnings Series","slug":"ai-learnings-series","term_group":0,"term_taxonomy_id":1041,"taxonomy":"category","description":"","parent":0,"count":49,"filter":"raw","cat_ID":1031,"category_count":49,"category_description":"","cat_name":"AI Learnings Series","category_nicename":"ai-learnings-series","category_parent":0},{"term_id":441,"name":"Tech Talk","slug":"tech-talk","term_group":0,"term_taxonomy_id":451,"taxonomy":"category","description":"","parent":0,"count":741,"filter":"raw","cat_ID":441,"category_count":741,"category_description":"","cat_name":"Tech Talk","category_nicename":"tech-talk","category_parent":0}],"tag_info":[{"term_id":930,"name":"AI Series","slug":"ai-series","term_group":0,"term_taxonomy_id":940,"taxonomy":"post_tag","description":"","parent":0,"count":228,"filter":"raw"},{"term_id":1060,"name":"AMD","slug":"amd","term_group":0,"term_taxonomy_id":1070,"taxonomy":"post_tag","description":"","parent":0,"count":19,"filter":"raw"},{"term_id":894,"name":"artificial 
intelligence","slug":"artificial-intelligence","term_group":0,"term_taxonomy_id":904,"taxonomy":"post_tag","description":"","parent":0,"count":201,"filter":"raw"},{"term_id":1053,"name":"gemma4","slug":"gemma4","term_group":0,"term_taxonomy_id":1063,"taxonomy":"post_tag","description":"","parent":0,"count":7,"filter":"raw"},{"term_id":876,"name":"LLM","slug":"llm","term_group":0,"term_taxonomy_id":886,"taxonomy":"post_tag","description":"","parent":0,"count":25,"filter":"raw"},{"term_id":986,"name":"Local AI","slug":"local-ai","term_group":0,"term_taxonomy_id":996,"taxonomy":"post_tag","description":"","parent":0,"count":60,"filter":"raw"},{"term_id":1061,"name":"NVIDIA","slug":"nvidia","term_group":0,"term_taxonomy_id":1071,"taxonomy":"post_tag","description":"","parent":0,"count":35,"filter":"raw"}],"_links":{"self":[{"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/posts\/521022","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/users\/2"}],"replies":[{"embeddable":true,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/comments?post=521022"}],"version-history":[{"count":18,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/posts\/521022\/revisions"}],"predecessor-version":[{"id":521147,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/posts\/521022\/revisions\/521147"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/media\/521050"}],"wp:attachment":[{"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/media?parent=521022"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/categories?post=521022"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/tags?post=521022"}],"curies":[{"name":"wp","hr
ef":"https:\/\/api.w.org\/{rel}","templated":true}]}}