 {"id":521022,"date":"2026-05-24T00:22:24","date_gmt":"2026-05-24T07:22:24","guid":{"rendered":"https:\/\/jorgep.com\/blog\/?p=521022"},"modified":"2026-06-08T10:14:42","modified_gmt":"2026-06-08T17:14:42","slug":"unlocking-the-npu-fastflowlm","status":"publish","type":"post","link":"https:\/\/jorgep.com\/blog\/unlocking-the-npu-fastflowlm\/","title":{"rendered":"Unlocking the NPU: FastFlowLM"},"content":{"rendered":"\n<div class=\"wp-block-columns has-theme-palette-7-background-color has-background is-layout-flex wp-container-core-columns-is-layout-5dc627e1 wp-block-columns-is-layout-flex\" style=\"margin-top:0;margin-bottom:0\">\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\" style=\"flex-basis:80%\">\n<p class=\"wp-block-paragraph\">Part of: <strong> <a href=\"https:\/\/jorgep.com\/blog\/series-ai-learnings\/\">AI Learning Series Here<\/a><\/strong><\/p>\n\n\n<style>.kadence-column395113_97b87a-23 > .kt-inside-inner-col,.kadence-column395113_97b87a-23 > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column395113_97b87a-23 > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column395113_97b87a-23 > .kt-inside-inner-col{flex-direction:column;}.kadence-column395113_97b87a-23 > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column395113_97b87a-23 > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column395113_97b87a-23{position:relative;}@media all and (max-width: 1024px){.kadence-column395113_97b87a-23 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column395113_97b87a-23 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column395113_97b87a-23\"><div class=\"kt-inside-inner-col\"><style>.kadence-column510545_44b637-b5 > 
.kt-inside-inner-col{padding-top:var(--global-kb-spacing-xs, 1rem);padding-bottom:var(--global-kb-spacing-xs, 1rem);}.kadence-column510545_44b637-b5 > .kt-inside-inner-col,.kadence-column510545_44b637-b5 > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column510545_44b637-b5 > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column510545_44b637-b5 > .kt-inside-inner-col{flex-direction:column;}.kadence-column510545_44b637-b5 > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column510545_44b637-b5 > .kt-inside-inner-col{background-color:var(--global-palette7, #EDF2F7);}.kadence-column510545_44b637-b5:hover > .kt-inside-inner-col{background-color:var(--global-palette8, #F7FAFC);background-image:none;}.kadence-column510545_44b637-b5 > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column510545_44b637-b5{position:relative;}@media all and (max-width: 1024px){.kadence-column510545_44b637-b5 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column510545_44b637-b5 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column510545_44b637-b5\"><div class=\"kt-inside-inner-col\"><style>.wp-block-kadence-advancedheading.kt-adv-heading510545_f7c5f8-ed, .wp-block-kadence-advancedheading.kt-adv-heading510545_f7c5f8-ed[data-kb-block=\"kb-adv-heading510545_f7c5f8-ed\"]{text-align:center;font-size:var(--global-kb-font-size-md, 1.25rem);font-style:normal;}.wp-block-kadence-advancedheading.kt-adv-heading510545_f7c5f8-ed mark.kt-highlight, .wp-block-kadence-advancedheading.kt-adv-heading510545_f7c5f8-ed[data-kb-block=\"kb-adv-heading510545_f7c5f8-ed\"] 
mark.kt-highlight{font-style:normal;color:#f76a0c;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding-top:0px;padding-right:0px;padding-bottom:0px;padding-left:0px;}.wp-block-kadence-advancedheading.kt-adv-heading510545_f7c5f8-ed img.kb-inline-image, .wp-block-kadence-advancedheading.kt-adv-heading510545_f7c5f8-ed[data-kb-block=\"kb-adv-heading510545_f7c5f8-ed\"] img.kb-inline-image{width:150px;vertical-align:baseline;}<\/style>\n<p class=\"kt-adv-heading510545_f7c5f8-ed wp-block-kadence-advancedheading\" data-kb-block=\"kb-adv-heading510545_f7c5f8-ed\">Quick Links:&nbsp;<a href=\"https:\/\/jorgep.com\/blog\/resources-for-learning-ai\/\">Resources for Learning AI<\/a> | <a href=\"https:\/\/jorgep.com\/blog\/keeping-up-with-ai\/\">Keep up with AI<\/a> | <a href=\"https:\/\/jorgep.com\/blog\/list-of-ai-tools\/\" data-type=\"post\" data-id=\"402818\">List of AI Tools<\/a> | <a href=\"https:\/\/jorgep.com\/blog\/local-ai-series\/\" data-type=\"page\" data-id=\"519365\">Local AI<\/a> | <a href=\"https:\/\/jorgep.com\/blog\/tag\/ai-agents\/\" data-type=\"post_tag\" data-id=\"941\">AI Agents<\/a> |  <a href=\"https:\/\/jorgep.com\/blog\/work-beyond-tomorrow-series\/\" data-type=\"page\" data-id=\"365001\">Future of Work<\/a><\/p>\n<\/div><\/div>\n<\/div><\/div>\n\n\n<style>.kb-row-layout-id395113_d73e95-0d > .kt-row-column-wrap{align-content:start;}:where(.kb-row-layout-id395113_d73e95-0d > .kt-row-column-wrap) > .wp-block-kadence-column{justify-content:start;}.kb-row-layout-id395113_d73e95-0d > .kt-row-column-wrap{column-gap:var(--global-kb-gap-md, 2rem);row-gap:var(--global-kb-gap-none, 0rem );padding-top:var(--global-kb-spacing-xxs, 0.5rem);padding-bottom:var(--global-kb-spacing-xxs, 0.5rem);grid-template-columns:repeat(2, minmax(0, 1fr));}.kb-row-layout-id395113_d73e95-0d > .kt-row-layout-overlay{opacity:0.30;}@media all and (max-width: 1024px){.kb-row-layout-id395113_d73e95-0d > .kt-row-column-wrap{grid-template-columns:repeat(2, minmax(0, 
1fr));}}@media all and (max-width: 767px){.kb-row-layout-id395113_d73e95-0d > .kt-row-column-wrap{grid-template-columns:minmax(0, 1fr);}}<\/style><div class=\"kb-row-layout-wrap kb-row-layout-id395113_d73e95-0d alignnone wp-block-kadence-rowlayout\"><div class=\"kt-row-column-wrap kt-has-2-columns kt-row-layout-equal kt-tab-layout-inherit kt-mobile-layout-row kt-row-valign-top\">\n<style>.kadence-column395113_df36f9-de > .kt-inside-inner-col,.kadence-column395113_df36f9-de > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column395113_df36f9-de > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column395113_df36f9-de > .kt-inside-inner-col{flex-direction:column;}.kadence-column395113_df36f9-de > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column395113_df36f9-de > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column395113_df36f9-de{position:relative;}@media all and (max-width: 1024px){.kadence-column395113_df36f9-de > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column395113_df36f9-de > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column395113_df36f9-de\"><div class=\"kt-inside-inner-col\"><style>.wp-block-kadence-advancedheading.kt-adv-heading395113_b3212c-b9, .wp-block-kadence-advancedheading.kt-adv-heading395113_b3212c-b9[data-kb-block=\"kb-adv-heading395113_b3212c-b9\"]{text-align:center;font-size:var(--global-kb-font-size-sm, 0.9rem);line-height:60px;font-style:normal;background-color:#f5a511;}.wp-block-kadence-advancedheading.kt-adv-heading395113_b3212c-b9 mark.kt-highlight, .wp-block-kadence-advancedheading.kt-adv-heading395113_b3212c-b9[data-kb-block=\"kb-adv-heading395113_b3212c-b9\"] 
mark.kt-highlight{font-style:normal;color:#f76a0c;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding-top:0px;padding-right:0px;padding-bottom:0px;padding-left:0px;}.wp-block-kadence-advancedheading.kt-adv-heading395113_b3212c-b9 img.kb-inline-image, .wp-block-kadence-advancedheading.kt-adv-heading395113_b3212c-b9[data-kb-block=\"kb-adv-heading395113_b3212c-b9\"] img.kb-inline-image{width:150px;vertical-align:baseline;}<\/style>\n<p class=\"kt-adv-heading395113_b3212c-b9 wp-block-kadence-advancedheading\" data-kb-block=\"kb-adv-heading395113_b3212c-b9\">Subscribe to <a href=\"https:\/\/go.35s.be\/jtb\" target=\"_blank\" rel=\"noreferrer noopener\"><strong>JorgeTechBits  newsletter<\/strong><\/a><\/p>\n<\/div><\/div>\n\n\n<style>.kadence-column395113_4b4b81-29 > .kt-inside-inner-col,.kadence-column395113_4b4b81-29 > .kt-inside-inner-col:before{border-top-left-radius:0px;border-top-right-radius:0px;border-bottom-right-radius:0px;border-bottom-left-radius:0px;}.kadence-column395113_4b4b81-29 > .kt-inside-inner-col{column-gap:var(--global-kb-gap-sm, 1rem);}.kadence-column395113_4b4b81-29 > .kt-inside-inner-col{flex-direction:column;}.kadence-column395113_4b4b81-29 > .kt-inside-inner-col > .aligncenter{width:100%;}.kadence-column395113_4b4b81-29 > .kt-inside-inner-col:before{opacity:0.3;}.kadence-column395113_4b4b81-29{position:relative;}@media all and (max-width: 1024px){.kadence-column395113_4b4b81-29 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}@media all and (max-width: 767px){.kadence-column395113_4b4b81-29 > .kt-inside-inner-col{flex-direction:column;justify-content:center;}}<\/style>\n<div class=\"wp-block-kadence-column kadence-column395113_4b4b81-29\"><div class=\"kt-inside-inner-col\"><\/div><\/div>\n\n<\/div><\/div><\/div>\n\n\n\n<div class=\"wp-block-column is-layout-flow wp-block-column-is-layout-flow\"><div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><a 
href=\"https:\/\/jorgep.com\/blog\/book-dont-just-chat-delegate\/\"><img loading=\"lazy\" decoding=\"async\" width=\"640\" height=\"1024\" src=\"https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-640x1024.jpg\" alt=\"\" class=\"wp-image-520234\" style=\"aspect-ratio:0.6250142320391666;width:98px;height:auto\" srcset=\"https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-640x1024.jpg 640w, https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-188x300.jpg 188w, https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-768x1229.jpg 768w, https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-960x1536.jpg 960w, https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01-1280x2048.jpg 1280w, https:\/\/jorgep.com\/blog\/wp-content\/uploads\/CoverBook-01.jpg 1600w\" sizes=\"auto, (max-width: 640px) 100vw, 640px\" \/><\/a><figcaption class=\"wp-element-caption\"><a href=\"https:\/\/jorgep.com\/blog\/book-series-ai-dont-just-chat\/\" data-type=\"page\" data-id=\"520242\">Check out the Book Series<\/a><\/figcaption><\/figure>\n<\/div><\/div>\n<\/div>\n\n\n<style>.wp-block-kadence-advancedheading.kt-adv-heading519190_b33a00-c9, .wp-block-kadence-advancedheading.kt-adv-heading519190_b33a00-c9[data-kb-block=\"kb-adv-heading519190_b33a00-c9\"]{font-size:var(--global-kb-font-size-sm, 0.9rem);font-style:normal;}.wp-block-kadence-advancedheading.kt-adv-heading519190_b33a00-c9 mark.kt-highlight, .wp-block-kadence-advancedheading.kt-adv-heading519190_b33a00-c9[data-kb-block=\"kb-adv-heading519190_b33a00-c9\"] mark.kt-highlight{font-style:normal;color:#f76a0c;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding-top:0px;padding-right:0px;padding-bottom:0px;padding-left:0px;}.wp-block-kadence-advancedheading.kt-adv-heading519190_b33a00-c9 img.kb-inline-image, .wp-block-kadence-advancedheading.kt-adv-heading519190_b33a00-c9[data-kb-block=\"kb-adv-heading519190_b33a00-c9\"] 
img.kb-inline-image{width:150px;vertical-align:baseline;}<\/style>\n<p class=\"kt-adv-heading519190_b33a00-c9 wp-block-kadence-advancedheading\" data-kb-block=\"kb-adv-heading519190_b33a00-c9\"><strong>Disclaimer:<\/strong> <strong>I create this content entirely on my own time, and the views expressed here are mine alone (not my employer&#8217;s)<\/strong>. Because I love leveraging new tech, I use AI tools like Gemini, NotebookLM, Claude, Perplexity and others as a &#8220;digital team&#8221; to help research and polish these articles so I can share the best possible insights with you!<\/p>\n\n\n\n<h4 class=\"wp-block-heading\"><em>How I Bypassed Ollama and LM studio Limitations on my Ryzen AI NPU to Hit 50+ TPS <\/em><\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">If you recently purchased a modern AI-PC, you bought into a promising vision: a dedicated, cutting-edge Neural Processing Unit (NPU) sitting right inside your silicon, designed to stream large language models (LLMs) smoothly without draining your battery or spinning up your GPU fans. But if you are like me, your initial attempt to tap into that hardware felt like hitting a brick wall.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">I started where most enthusiasts do: downloading industry staples like <strong>Ollama<\/strong> and <strong>LM Studio<\/strong>. I assumed they would automatically detect the specialized NPU. Instead, they ignored it entirely. Because these tools are architected for general-purpose GPU execution (CUDA\/ROCm), my state-of-the-art Ryzen AI chip was sidelined. The system defaulted to heavy CPU emulation, yielding a sluggish, disappointing <strong>3 to 10 tokens per second (TPS)<\/strong>. 
It was inefficient, and left a massive piece of my processor sitting idle.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">Then, I discovered <strong><a href=\"https:\/\/fastflowlm.com\/\" target=\"_blank\" rel=\"noopener noreferrer nofollow\">FastFlowLM (FLM)<\/a><\/strong>.<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">The Breakthrough: Native Hardware Optimization<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">FLM is built from the ground up to execute models natively on XDNA-based architectures. By leveraging AMD\u2019s specialized kernel drivers, it bypasses standard GPU-centric abstraction layers to communicate directly with the NPU silicon.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">The difference was staggering. The moment I initiated the FLM environment, it claimed exclusive, low-overhead access to the NPU. My standard daily-driver models jumped from single-digit speeds to a consistent <strong>22 to 37 TPS<\/strong>. The generation was fluid, instant, and consumed virtually no background GPU resources. When I switched to edge-optimized small language models\u2014specifically Meta\u2019s <code>llama3.2:1b<\/code>\u2014the performance cracked an unbelievable <strong>50+ TPS<\/strong>.<\/p>\n\n\n\n<figure class=\"wp-block-table\"><table class=\"has-fixed-layout\"><thead><tr><th class=\"has-text-align-left\" data-align=\"left\">Inference Engine<\/th><th class=\"has-text-align-left\" data-align=\"left\">Target Hardware Block<\/th><th class=\"has-text-align-left\" data-align=\"left\">Avg. 
Performance<\/th><th class=\"has-text-align-left\" data-align=\"left\">User Experience<\/th><\/tr><\/thead><tbody><tr><td class=\"has-text-align-left\" data-align=\"left\">Ollama \/ LM Studio<\/td><td class=\"has-text-align-left\" data-align=\"left\">CPU (Fallback)<\/td><td class=\"has-text-align-left\" data-align=\"left\">3 \u2013 10 TPS<\/td><td class=\"has-text-align-left\" data-align=\"left\">Sluggish, high overhead<\/td><\/tr><tr><td class=\"has-text-align-left\" data-align=\"left\">FastFlowLM (FLM)<\/td><td class=\"has-text-align-left\" data-align=\"left\">Native Ryzen NPU<\/td><td class=\"has-text-align-left\" data-align=\"left\">22 \u2013 37 TPS<\/td><td class=\"has-text-align-left\" data-align=\"left\">Fluid, immediate<\/td><\/tr><tr><td class=\"has-text-align-left\" data-align=\"left\"><strong>FLM (Llama 3.2 1B)<\/strong><\/td><td class=\"has-text-align-left\" data-align=\"left\"><strong>Native Ryzen NPU<\/strong><\/td><td class=\"has-text-align-left\" data-align=\"left\"><strong>50+ TPS<\/strong><\/td><td class=\"has-text-align-left\" data-align=\"left\"><strong>Blindingly fast<\/strong><\/td><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<h3 class=\"wp-block-heading\">Setting Up Your Environment<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">Configuration I used: <\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>FastFlowLM on the host machine (the only one as of now to utilize the NPU) \n<ul class=\"wp-block-list\">\n<li>Unless you changed the default, models will go into c:\\users\\&lt;user>\\.flm\\models<\/li>\n\n\n\n<li>Started with llama3.2:1b but later added llama3.2:3b, gemma4, and qwen3.5:4B, and they worked GREAT! 
<\/li>\n<\/ul>\n<\/li>\n\n\n\n<li><\/li>\n\n\n\n<li>Docker (because I like to keep my machine clean, and can configure many items virtually)\n<ul class=\"wp-block-list\">\n<li>Running OpenWebUI (see yml file below) for testing<\/li>\n\n\n\n<li>Added the local LLM endpoint to my <a href=\"https:\/\/35sites.com\/applications\/unified-aichat-hub\/\" target=\"_blank\" rel=\"noopener noreferrer nofollow\">Unified AI Chat Hub Application<\/a> (as per below)<\/li>\n\n\n\n<li><\/li>\n<\/ul>\n<\/li>\n<\/ul>\n\n\n\n<p class=\"wp-block-paragraph\">To get started, <\/p>\n\n\n\n<ol class=\"wp-block-list\">\n<li>you must first install the <strong>FLM Application<\/strong> directly on your Windows or Linux OS (check out their <a href=\"https:\/\/github.com\/julienM77\/FastFlowLM\" target=\"_blank\" rel=\"noopener noreferrer nofollow\">GitHub repository<\/a>). This binary acts as the bridge between your model files and the NPU silicon.\n<ul class=\"wp-block-list\">\n<li>Start FLM from the terminal using the following command:<\/li>\n\n\n\n<li><\/li>\n<\/ul>\n<\/li>\n\n\n\n<li><strong>flm serve llama3.2:1b --host 0.0.0.0<\/strong>\n<ul class=\"wp-block-list\">\n<li><\/li>\n\n\n\n<li>Note: If you use the standard command, flm serve llama3.2:1b, and do not use the --host 0.0.0.0 flag, it will only bind itself to 127.0.0.1 (localhost), which means you will not be able to access it from other computers.<\/li>\n<\/ul>\n<\/li>\n\n\n\n<li>You are not able to interact with llama3.2:1b on the terminal. If you open another terminal session and type flm list, you will see all of the models available (a lot!)   
for your machine.<\/li>\n\n\n\n<li>While FLM runs well in a terminal, most users prefer a GUI to display the server status and to interface with the chat \n<ul class=\"wp-block-list\">\n<li>For a display GUI of the server (where you can start\/stop, configure the server, and also show NPU usage), there is a tool called <a href=\"https:\/\/github.com\/julienM77\/flm-companion\" target=\"_blank\" rel=\"noopener noreferrer nofollow\">FLM Companion<\/a> (highly recommended!)<\/li>\n\n\n\n<li>For a front-end chatbot, I integrated my FLM instance into a containerized workflow using <strong>OpenWebUI<\/strong> and <strong>n8n<\/strong>. Because OpenWebUI expects an OpenAI-compliant API, you can point it directly to your local FLM serving port.<\/li>\n<\/ul>\n<\/li>\n<\/ol>\n\n\n\n<p class=\"wp-block-paragraph\">If you are using Docker, configure your <code>docker-compose.yml<\/code> like this:<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>services:\n  open-webui:\n    image: ghcr.io\/open-webui\/open-webui:main\n    container_name: open-webui\n    ports:\n      - \"3000:8080\"\n    environment:\n      - ENABLE_OLLAMA=False\n      - OPENAI_API_BASE_URL=http:\/\/host.docker.internal:52625\/v1\n      - OPENAI_API_KEY=flm\n    extra_hosts:\n      - \"host.docker.internal:host-gateway\"<\/code><\/pre>\n\n\n\n<blockquote class=\"wp-block-quote is-layout-flow wp-block-quote-is-layout-flow\">\n<p class=\"wp-block-paragraph\"><strong>\u26a0\ufe0f Crucial Note:<\/strong> Because Docker containers operate in an isolated network, routing is key. 
<code>ENABLE_OLLAMA=False<\/code> prevents connection timeouts, while <code>host.docker.internal<\/code> (and the <code>host-gateway<\/code> flag) maps the container bridge back to your host machine&#8217;s FLM port.<\/p>\n<\/blockquote>\n\n\n\n<p class=\"wp-block-paragraph\">Alternatively, if you have a different chatbot interface ( like my own: <a href=\"https:\/\/35sites.com\/applications\/unified-aichat-hub\/\" target=\"_blank\" rel=\"noopener noreferrer nofollow\">Unified Chat Hub<\/a>)    use <\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>http:\/\/&lt;your.IP.ADDRESS>:52625\/v1\/chat\/completions<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n\n<h3 class=\"wp-block-heading\">Analyzing Real-Time Execution<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">When a request hits your stack, the handoff is immediate. The execution logs confirm the NPU engagement:<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>&#91;\ud83d\udd17 ] TCP connection established - Remote: 127.0.0.1:61894\n&#91;\ud83d\udfe2 ] NPU Locked!\n&#91;FLM] Model: gemma4-it:e4b\n&#91;FLM] Start prefill... (14 tokens)\n&#91;FLM] Start generating...<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\">The <code>[\ud83d\udfe2 ] NPU Locked!<\/code> status indicates that FLM has successfully bypassed the CPU\/GPU bottleneck and established a direct hardware lock. The model prefill phase caches your context into high-speed memory, allowing subsequent inference to leverage the NPU for near-instant response times.<\/p>\n\n\n\n<h3 class=\"wp-block-heading\">The Takeaway<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">NPUs represent a paradigm shift for local AI, but hardware is only as good as the software stack running on it. If you are stuck using legacy, GPU-dependent tools for your Ryzen AI silicon, you are leaving massive performance gains on the table. 
Moving to a native engine like FLM transforms your hardware from an underutilized component into a highly responsive, ultra-fast localized AI powerhouse.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">Update: As I researched and read more about these guys, I noticed that FLM is now embedded into <a href=\"https:\/\/lemonade-server.ai\/\" target=\"_blank\" rel=\"noopener noreferrer nofollow\">Lemonade: Local AI<\/a>.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n<style>.wp-block-kadence-advancedheading.kt-adv-heading407818_afcbba-c7, .wp-block-kadence-advancedheading.kt-adv-heading407818_afcbba-c7[data-kb-block=\"kb-adv-heading407818_afcbba-c7\"]{font-size:var(--global-kb-font-size-sm, 0.9rem);font-style:normal;}.wp-block-kadence-advancedheading.kt-adv-heading407818_afcbba-c7 mark.kt-highlight, .wp-block-kadence-advancedheading.kt-adv-heading407818_afcbba-c7[data-kb-block=\"kb-adv-heading407818_afcbba-c7\"] mark.kt-highlight{font-style:normal;color:#f76a0c;-webkit-box-decoration-break:clone;box-decoration-break:clone;padding-top:0px;padding-right:0px;padding-bottom:0px;padding-left:0px;}.wp-block-kadence-advancedheading.kt-adv-heading407818_afcbba-c7 img.kb-inline-image, .wp-block-kadence-advancedheading.kt-adv-heading407818_afcbba-c7[data-kb-block=\"kb-adv-heading407818_afcbba-c7\"] img.kb-inline-image{width:150px;vertical-align:baseline;}<\/style>\n<p class=\"kt-adv-heading407818_afcbba-c7 wp-block-kadence-advancedheading\" data-kb-block=\"kb-adv-heading407818_afcbba-c7\">Have questions, ideas to share, or just want to connect? I\u2019d love to hear from you! 
Check out my <a href=\"https:\/\/jorgep.com\/blog\/about\/\">About Page<\/a> to learn more about me or connect with me.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><strong><a href=\"https:\/\/jorgep.com\/blog\/category\/ai\/\" target=\"_blank\" rel=\"noreferrer noopener\">Jorge&#8217;s Local AI Series<\/a><\/strong>  |  <strong><a href=\"https:\/\/jorgep.com\/blog\/book-series-ai-dont-just-chat\/\" target=\"_blank\" rel=\"noreferrer noopener\">AI Learnings Series<\/a><\/strong> <\/li>\n\n\n\n<li><\/li>\n\n\n\n<li><strong><a href=\"https:\/\/ai.google.dev\/gemma\" target=\"_blank\" rel=\"noreferrer noopener\">Google Gemma 4 Technical Report<\/a><\/strong><\/li>\n\n\n\n<li><strong><a href=\"https:\/\/developer.nvidia.com\/nemotron\" target=\"_blank\" rel=\"noreferrer noopener\">NVIDIA Nemotron Nano 3 Omni Documentation<\/a><\/strong><\/li>\n\n\n\n<li><strong><a href=\"https:\/\/ollama.com\/\" target=\"_blank\" rel=\"noreferrer noopener\">Ollama (simplified local deployment)<\/a><\/strong><\/li>\n\n\n\n<li><strong><a href=\"https:\/\/github.com\/vllm-project\/vllm\" target=\"_blank\" rel=\"noreferrer noopener\">vLLM (throughput-optimized serving)<\/a><\/strong><\/li>\n<\/ul>\n\n\n\n<p class=\"wp-block-paragraph\"><\/p>\n","protected":false},"excerpt":{"rendered":"<p>How I Bypassed Ollama and LM studio Limitations on my Ryzen AI NPU to Hit 50+ TPS If you recently purchased a modern AI-PC, you bought into a promising vision: a dedicated, cutting-edge Neural Processing Unit (NPU) sitting right inside your silicon, designed to stream large language models (LLMs) smoothly without draining your battery 
or&#8230;<\/p>\n","protected":false},"author":2,"featured_media":427864,"comment_status":"closed","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"_kad_blocks_custom_css":"","_kad_blocks_head_custom_js":"","_kad_blocks_body_custom_js":"","_kad_blocks_footer_custom_js":"","ngg_post_thumbnail":0,"episode_type":"","audio_file":"","podmotor_file_id":"","podmotor_episode_id":"","cover_image":"","cover_image_id":"","duration":"","filesize":"","filesize_raw":"","date_recorded":"","explicit":"","block":"","itunes_episode_number":"","itunes_title":"","itunes_season_number":"","itunes_episode_type":"","_kad_post_transparent":"","_kad_post_title":"","_kad_post_layout":"","_kad_post_sidebar_id":"","_kad_post_content_style":"","_kad_post_vertical_padding":"","_kad_post_feature":"","_kad_post_feature_position":"","_kad_post_header":false,"_kad_post_footer":false,"_kad_post_classname":"","footnotes":""},"categories":[1031,441],"tags":[471,930,894,1053,876,986],"class_list":["post-521022","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-ai-learnings-series","category-tech-talk","tag-ai","tag-ai-series","tag-artificial-intelligence","tag-gemma4","tag-llm","tag-local-ai"],"taxonomy_info":{"category":[{"value":1031,"label":"AI Learnings Series"},{"value":441,"label":"Tech Talk"}],"post_tag":[{"value":471,"label":"AI"},{"value":930,"label":"AI Series"},{"value":894,"label":"artificial intelligence"},{"value":1053,"label":"gemma4"},{"value":876,"label":"LLM"},{"value":986,"label":"Local AI"}]},"featured_image_src_large":["https:\/\/jorgep.com\/blog\/wp-content\/uploads\/FeaturedImage-Topic-AI-1024x512.png",1024,512,true],"author_info":{"display_name":"Jorge Pereira","author_link":"https:\/\/jorgep.com\/blog\/author\/jorge\/"},"comment_info":0,"category_info":[{"term_id":1031,"name":"AI Learnings 
Series","slug":"ai-learnings-series","term_group":0,"term_taxonomy_id":1041,"taxonomy":"category","description":"","parent":0,"count":30,"filter":"raw","cat_ID":1031,"category_count":30,"category_description":"","cat_name":"AI Learnings Series","category_nicename":"ai-learnings-series","category_parent":0},{"term_id":441,"name":"Tech Talk","slug":"tech-talk","term_group":0,"term_taxonomy_id":451,"taxonomy":"category","description":"","parent":0,"count":720,"filter":"raw","cat_ID":441,"category_count":720,"category_description":"","cat_name":"Tech Talk","category_nicename":"tech-talk","category_parent":0}],"tag_info":[{"term_id":471,"name":"AI","slug":"ai","term_group":0,"term_taxonomy_id":481,"taxonomy":"post_tag","description":"","parent":0,"count":178,"filter":"raw"},{"term_id":930,"name":"AI Series","slug":"ai-series","term_group":0,"term_taxonomy_id":940,"taxonomy":"post_tag","description":"","parent":0,"count":185,"filter":"raw"},{"term_id":894,"name":"artificial intelligence","slug":"artificial-intelligence","term_group":0,"term_taxonomy_id":904,"taxonomy":"post_tag","description":"","parent":0,"count":180,"filter":"raw"},{"term_id":1053,"name":"gemma4","slug":"gemma4","term_group":0,"term_taxonomy_id":1063,"taxonomy":"post_tag","description":"","parent":0,"count":6,"filter":"raw"},{"term_id":876,"name":"LLM","slug":"llm","term_group":0,"term_taxonomy_id":886,"taxonomy":"post_tag","description":"","parent":0,"count":20,"filter":"raw"},{"term_id":986,"name":"Local 
AI","slug":"local-ai","term_group":0,"term_taxonomy_id":996,"taxonomy":"post_tag","description":"","parent":0,"count":48,"filter":"raw"}],"_links":{"self":[{"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/posts\/521022","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/users\/2"}],"replies":[{"embeddable":true,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/comments?post=521022"}],"version-history":[{"count":10,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/posts\/521022\/revisions"}],"predecessor-version":[{"id":521038,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/posts\/521022\/revisions\/521038"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/media\/427864"}],"wp:attachment":[{"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/media?parent=521022"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/categories?post=521022"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/jorgep.com\/blog\/wp-json\/wp\/v2\/tags?post=521022"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}