diff --git a/Common/CUDA/FbpFiltration.cu b/Common/CUDA/FbpFiltration.cu
new file mode 100644
index 00000000..0a5db4bf
--- /dev/null
+++ b/Common/CUDA/FbpFiltration.cu
@@ -0,0 +1,256 @@
+/*-------------------------------------------------------------------------
+ *
+ * CUDA functions for convolution
+ *
+ * Applies the convolution filter in the Fourier space.
+ * The filter should be given in the Fourier transformed form.
+ *
+ * CODE by       Tomoyuki SADAKANE
+ * ---------------------------------------------------------------------------
+ * ---------------------------------------------------------------------------
+ * Copyright (c) 2015, University of Bath and CERN- European Organization for
+ * Nuclear Research
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ * ---------------------------------------------------------------------------
+ *
+ * Contact: tigre.toolbox@gmail.com
+ * Codes  : https://github.com/CERN/TIGRE
+ * ---------------------------------------------------------------------------
+ */
+
+#include "TIGRE_common.hpp"
+#include "FbpFiltration.hpp"
+#include <string>
+
+#define cudaCheckErrors(msg) \
+do { \
+        cudaError_t __err = cudaGetLastError(); \
+        if (__err != cudaSuccess) { \
+                mexPrintf("%s \n",msg);\
+                mexErrMsgIdAndTxt("apply_filtration",cudaGetErrorString(__err));\
+        } \
+} while (0)
+
+void cudafftCheckError(cufftResult_t fftResult, const std::string& rstrMsg) {
+    std::string strError = "Unknown error";
+    if (fftResult == CUFFT_SUCCESS){ return; }
+    else if (fftResult == CUFFT_INVALID_PLAN  ) { strError = "The plan parameter is not a valid handle. Handle is not valid when the plan is locked."; }
+    else if (fftResult == CUFFT_ALLOC_FAILED  ) { strError = "The allocation of GPU resources for the plan failed."; }
+    else if (fftResult == CUFFT_INVALID_VALUE ) { strError = "One or more invalid parameters were passed to the API."; }
+    else if (fftResult == CUFFT_INTERNAL_ERROR) { strError = "An internal driver error was detected."; }
+    else if (fftResult == CUFFT_SETUP_FAILED  ) { strError = "The cuFFT library failed to initialize."; }
+    else if (fftResult == CUFFT_INVALID_SIZE  ) { strError = "The nx or batch parameter is not a supported size."; }
+    mexPrintf("%s \n", rstrMsg.c_str());
+    mexErrMsgIdAndTxt("ApplyFiltration", strError.c_str());
+}
+
+__global__ void ApplyFilter(cufftComplex* pcfInOut, size_t uiULen, size_t uiVLen, float* pfFilter, float fULInv) {
+
+    size_t uiU = threadIdx.x + blockIdx.x * blockDim.x;
+    size_t uiV = threadIdx.y + blockIdx.y * blockDim.y;
+    if (uiV >= uiVLen || uiU >= uiULen) {
+        return;
+    }
+    pcfInOut[uiU+uiULen*uiV].x *= pfFilter[uiU]*fULInv;
+    pcfInOut[uiU+uiULen*uiV].y *= pfFilter[uiU]*fULInv;
+}
+
+//! Apply filter in the Fourier space
+void apply_filtration (const float* pfIn, size_t uiULen, size_t uiVLen, const float* pfFilter, float* pfOut, const GpuIds& gpuids) {
+    // Prepare for MultiGPU
+    int deviceCount = gpuids.GetLength();
+    cudaCheckErrors("Device query fail");
+    if (deviceCount == 0) {
+        mexErrMsgIdAndTxt("apply_filtration","There are no available device(s) that support CUDA\n");
+    }
+    //
+    // CODE assumes
+    // 1.-All available devices are usable by this code
+    // 2.-All available devices are equal, they are the same machine (warning thrown)
+    // Check the available devices, and if they are the same
+    if (!gpuids.AreEqualDevices()) {
+        mexWarnMsgIdAndTxt("apply_filtration","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed.");
+    }
+    // USE THE FIRST GPU ONLY!!!!!!!!!!!!!!!!! 
+    cudaSetDevice(gpuids[0]);
+
+    const size_t uiLen = uiULen * uiVLen;
+    const float fULInv = 1./uiULen;
+
+    float* d_pfInOut = nullptr;
+    cudaMalloc((void **)&d_pfInOut, uiLen * sizeof(float));
+    cudaCheckErrors("apply_filtration fail cudaMalloc 1");
+    cudaMemcpy(d_pfInOut, pfIn, uiLen* sizeof(float), cudaMemcpyHostToDevice); // Sync only. pfIn is not pinned.
+    cudaCheckErrors("apply_filtration fail cudaMemcpy 1");
+
+    size_t uiBufferSize = (uiULen+1)/2+1;    // Buffer size for R2C. See https://docs.nvidia.com/cuda/cufft/
+
+	cufftHandle cudafftPlanFwd;
+	cufftHandle cudafftPlanInv;
+    const int iBatch = uiVLen;
+	cufftResult_t fftresult;
+    fftresult = cufftPlan1d(&cudafftPlanFwd, uiULen, CUFFT_R2C, iBatch);
+    cudafftCheckError(fftresult, "apply_filtration fail cufftPlan1d 1");
+    fftresult = cufftPlan1d(&cudafftPlanInv, uiULen, CUFFT_C2R, iBatch);
+    cudafftCheckError(fftresult, "apply_filtration fail cufftPlan1d 2");
+
+    float* d_pfFilter = nullptr;
+    cudaMalloc((void **)&d_pfFilter, uiULen * sizeof(float));
+    cudaCheckErrors("apply_filtration fail cudaMalloc 2");
+    cudaMemcpy(d_pfFilter, pfFilter, uiULen * sizeof(float), cudaMemcpyHostToDevice);
+    cudaCheckErrors("apply_filtration fail cudaMemcpy 2");
+
+    cufftComplex* d_pcfWork = nullptr;
+    cudaMalloc((void **)&d_pcfWork, uiBufferSize * uiVLen*sizeof(cufftComplex));
+    cudaCheckErrors("apply_filtration fail cudaMalloc 3");
+
+    {
+        const int divU = 128;//PIXEL_SIZE_BLOCK;
+        const int divV = 1;//PIXEL_SIZE_BLOCK;
+        dim3 grid((uiULen+divU-1)/divU,(uiVLen+divV-1)/divV,1);
+        dim3 block(divU,divV,1);
+        cufftSetStream(cudafftPlanFwd, 0);
+        cufftSetStream(cudafftPlanInv, 0);
+        fftresult = cufftExecR2C (cudafftPlanFwd, d_pfInOut, d_pcfWork);
+        cudafftCheckError(fftresult, "apply_filtration fail cufftExecR2C");
+        ApplyFilter<<<grid, block>>>(d_pcfWork, uiBufferSize, uiVLen, d_pfFilter, fULInv);// Kernel d_pcfInOut = d_pcfInOut * pfFilter / uiULen
+        fftresult = cufftExecC2R (cudafftPlanInv, d_pcfWork, d_pfInOut);
+        cudafftCheckError(fftresult, "apply_filtration fail cufftExecC2R");
+    }
+    cudaMemcpy(pfOut, d_pfInOut, uiLen*sizeof(float), cudaMemcpyDeviceToHost);
+    cudaCheckErrors("apply_filtration fail cudaMemcpy 3");
+
+    cudaFree(d_pcfWork); d_pcfWork = nullptr;
+    cudaFree(d_pfInOut); d_pfInOut = nullptr;
+    cudaFree(d_pfFilter); d_pfFilter = nullptr;
+    cufftDestroy(cudafftPlanFwd);
+    cufftDestroy(cudafftPlanInv);
+}
+
+
+//! Apply filter in the Fourier space
+void apply_filtration2 (const float* pfInAll, size_t uiOffset, size_t uiULen, size_t uiBatchTotal, const float* pfFilter, size_t uiFLen, float fScale, float* pfOut, const GpuIds& gpuids) {
+    // Prepare for MultiGPU
+    int deviceCount = gpuids.GetLength();
+    cudaCheckErrors("Device query fail");
+    if (deviceCount == 0) {
+        mexErrMsgIdAndTxt("apply_filtration2","There are no available device(s) that support CUDA\n");
+    }
+    //
+    // CODE assumes
+    // 1.-All available devices are usable by this code
+    // 2.-All available devices are equal, they are the same machine (warning thrown)
+    // Check the available devices, and if they are the same
+    if (!gpuids.AreEqualDevices()) {
+        mexWarnMsgIdAndTxt("apply_filtration2","Detected one (or more) different GPUs.\n This code is not smart enough to separate the memory GPU wise if they have different computational times or memory limits.\n First GPU parameters used. If the code errors you might need to change the way GPU selection is performed.");
+    }
+    const float* pfIn = pfInAll+uiOffset;
+    const float fFLInv = 1./uiFLen;
+    const size_t uiBufferSize = (uiFLen+1)/2+1;    // Buffer size for R2C. See https://docs.nvidia.com/cuda/cufft/
+    const size_t uiPaddingLen = (uiFLen-uiULen) / 2;
+    const size_t uiBatch0 = uiBatchTotal / deviceCount;
+
+    float** pd_pfProjWide = (float**)malloc(deviceCount*sizeof(float*));
+    float** pd_pfFilter = (float**)malloc(deviceCount*sizeof(float*));
+    cufftComplex** pd_pcfWork = (cufftComplex**)malloc(deviceCount*sizeof(cufftComplex*));
+
+    size_t* puiBatch = (size_t*)malloc(deviceCount*sizeof(size_t));
+	cufftHandle* pcudafftPlanFwd = (cufftHandle*)malloc(deviceCount*sizeof(cufftHandle));
+	cufftHandle* pcudafftPlanInv = (cufftHandle*)malloc(deviceCount*sizeof(cufftHandle));
+
+    cufftResult_t fftresult;
+    
+    for (int dev = 0; dev < deviceCount; ++dev) {
+        pd_pfProjWide[dev] = nullptr;
+        pd_pfFilter[dev] = nullptr;
+        pd_pcfWork[dev] = nullptr;
+        puiBatch[dev] = dev!=deviceCount-1? uiBatch0 : uiBatchTotal - uiBatch0*dev;
+        size_t uiBatch = puiBatch[dev];
+
+        // Allocate extended projection
+        cudaSetDevice(gpuids[dev]);
+        cudaCheckErrors("apply_filtration2 fail cudaSetDevice");
+        cudaMalloc((void**)&pd_pfProjWide[dev], uiFLen*uiBatch*sizeof(float));
+        cudaCheckErrors("apply_filtration2 fail cudaMalloc wide");
+        // Fill 0
+        cudaMemset(pd_pfProjWide[dev], 0, uiFLen*uiBatch*sizeof(float));
+        cudaCheckErrors("apply_filtration2 fail cudaMemset");
+        // Insert projection
+        cudaMemcpy2D(&pd_pfProjWide[dev][uiPaddingLen], uiFLen*sizeof(float), &pfIn[dev*uiBatch0*uiULen], uiULen*sizeof(float), uiULen*sizeof(float), uiBatch, cudaMemcpyHostToDevice);
+        cudaCheckErrors("apply_filtration2 fail cudaMemcpy2D 1");
+
+        // Filter
+        cudaMalloc((void **)&pd_pfFilter[dev], uiFLen * sizeof(float));
+        cudaCheckErrors("apply_filtration2 fail cudaMalloc filter");
+        cudaMemcpy(pd_pfFilter[dev], pfFilter, uiFLen * sizeof(float), cudaMemcpyHostToDevice);
+        cudaCheckErrors("apply_filtration2 fail cudaMemcpy 2");
+
+        // Work space for FFT.
+        cudaMalloc((void **)&pd_pcfWork[dev], uiBufferSize * uiBatch*sizeof(cufftComplex));
+        cudaCheckErrors("apply_filtration2 fail cudaMalloc work");
+    }
+    for (int dev = 0; dev < deviceCount; ++dev) {
+        cudaSetDevice(gpuids[dev]);
+        const size_t uiBatch = puiBatch[dev];
+        fftresult = cufftPlan1d(&pcudafftPlanFwd[dev], uiFLen, CUFFT_R2C, uiBatch);
+        cudafftCheckError(fftresult, "apply_filtration2 fail cufftPlan1d 1");
+        fftresult = cufftPlan1d(&pcudafftPlanInv[dev], uiFLen, CUFFT_C2R, uiBatch);
+        cudafftCheckError(fftresult, "apply_filtration2 fail cufftPlan1d 2");
+        const int divU = 128;//PIXEL_SIZE_BLOCK;
+        const int divV = 1;//PIXEL_SIZE_BLOCK;
+        dim3 grid((uiFLen+divU-1)/divU,(uiBatch+divV-1)/divV,1);
+        dim3 block(divU,divV,1);
+        fftresult = cufftExecR2C (pcudafftPlanFwd[dev], pd_pfProjWide[dev], pd_pcfWork[dev]);
+        cudafftCheckError(fftresult, "apply_filtration2 fail cufftExecR2C");
+        ApplyFilter<<<grid, block>>>(pd_pcfWork[dev], uiBufferSize, uiBatch, pd_pfFilter[dev], fFLInv*fScale);// Kernel d_pcfInOut = d_pcfInOut * pfFilter / uiFLen * 
+        fftresult = cufftExecC2R (pcudafftPlanInv[dev], pd_pcfWork[dev], pd_pfProjWide[dev]);
+        cudafftCheckError(fftresult, "apply_filtration2 fail cufftExecC2R");
+    }
+    for (int dev = 0; dev < deviceCount; ++dev) {
+        cudaSetDevice(gpuids[dev]);
+        const size_t uiBatch = puiBatch[dev];
+        cudaMemcpy2D(&pfOut[dev*uiBatch0*uiULen], uiULen*sizeof(float), &pd_pfProjWide[dev][uiPaddingLen], uiFLen*sizeof(float), uiULen*sizeof(float), uiBatch, cudaMemcpyDeviceToHost);
+        cudaCheckErrors("apply_filtration2 fail cudaMemcpy2D 2");
+
+        cudaFree(pd_pcfWork[dev]); pd_pcfWork[dev] = nullptr;
+        cudaFree(pd_pfProjWide[dev]); pd_pfProjWide[dev] = nullptr;
+        cudaFree(pd_pfFilter[dev]); pd_pfFilter[dev] = nullptr;
+        cufftDestroy(pcudafftPlanFwd[dev]);
+        cufftDestroy(pcudafftPlanInv[dev]);
+        cudaCheckErrors("apply_filtration2 fail free");
+    }
+    cudaDeviceSynchronize();
+
+    free(pd_pfProjWide); pd_pfProjWide = nullptr;
+    free(pd_pfFilter); pd_pfFilter = nullptr;
+    free(puiBatch); puiBatch = nullptr;
+    free(pd_pcfWork); pd_pcfWork = nullptr;
+    free(pcudafftPlanFwd); pcudafftPlanFwd = nullptr;
+    free(pcudafftPlanInv); pcudafftPlanInv = nullptr;
+}
diff --git a/Common/CUDA/FbpFiltration.hpp b/Common/CUDA/FbpFiltration.hpp
new file mode 100644
index 00000000..ab8fd792
--- /dev/null
+++ b/Common/CUDA/FbpFiltration.hpp
@@ -0,0 +1,54 @@
+/*-------------------------------------------------------------------------
+ *
+ * Header CUDA functions for convolution
+ *
+ * Applies the convolution filter in the Fourier space.
+ * The filter should be given in the Fourier transformed form.
+ *
+ * CODE by       Tomoyuki SADAKANE
+ * ---------------------------------------------------------------------------
+ * ---------------------------------------------------------------------------
+ * Copyright (c) 2015, University of Bath and CERN- European Organization for
+ * Nuclear Research
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ * ---------------------------------------------------------------------------
+ *
+ * Contact: tigre.toolbox@gmail.com
+ * Codes  : https://github.com/CERN/TIGRE
+ * ---------------------------------------------------------------------------
+ */
+
+#include "TIGRE_common.hpp"
+#include "GpuIds.hpp"
+
+#include <complex>
+#include <cufft.h>
+
+void apply_filtration(const float* pfIn, size_t uiXLen, size_t uiYLen, const float* pfFilter, float* pfOut, const GpuIds& gpuids);
+void apply_filtration2 (const float* pfInAll, size_t uiOffset, size_t uiULen, size_t uiBatch, const float* pfFilter, size_t uiFLen, float fScale, float* pfOut, const GpuIds& gpuids) ;
diff --git a/MATLAB/Algorithms/FBP.m b/MATLAB/Algorithms/FBP.m
index 428d0421..8775c440 100644
--- a/MATLAB/Algorithms/FBP.m
+++ b/MATLAB/Algorithms/FBP.m
@@ -43,7 +43,6 @@
 
 %% Weight
 %proj=data
-proj=permute(proj,[2 1 3]);
 
 %% filter
 proj_filt = filtering(proj,geo,angles,parker); % Not sure if offsets are good in here
diff --git a/MATLAB/Algorithms/FDK.m b/MATLAB/Algorithms/FDK.m
index 58affe95..00500523 100644
--- a/MATLAB/Algorithms/FDK.m
+++ b/MATLAB/Algorithms/FDK.m
@@ -38,7 +38,7 @@
 % Codes:              https://github.com/CERN/TIGRE/
 % Coded by:           Kyungsang Kim, modified by Ander Biguri, Brandon Nelson 
 %--------------------------------------------------------------------------
-[filter,parker,dowang,gpuids]=parse_inputs(angles,varargin);
+[filter,parker,dowang,usegpufft,gpuids]=parse_inputs(angles,varargin);
 
 geo=checkGeo(geo,angles);
 geo.filter=filter;
@@ -59,9 +59,7 @@
 end
 
 %% Weight
-proj=permute(proj,[2 1 3]);
 for ii=1:size(angles,2)
-    
     us = ((-geo.nDetector(1)/2+0.5):1:(geo.nDetector(1)/2-0.5))*geo.dDetector(1) + offset(1,ii);
     vs = ((-geo.nDetector(2)/2+0.5):1:(geo.nDetector(2)/2-0.5))*geo.dDetector(2) + offset(2,ii);
     [uu,vv] = meshgrid(us,vs); % detector
@@ -70,10 +68,10 @@
     w = (geo.DSD(ii))./sqrt((geo.DSD(ii))^2+uu.^2 + vv.^2);
     
     % Multiply the weights with projection data
-    proj(:,:,ii) = proj(:,:,ii).*w';
+    proj(:,:,ii) = w.*proj(:,:,ii);
 end
 %% Fourier transform based filtering
-proj = filtering(proj,geo,angles,parker); % Not sure if offsets are good in here
+proj = filtering(proj,geo,angles,parker, 'usegpufft', usegpufft); % Not sure if offsets are good in here
 
 % RMFIELD Remove fields from a structure array.
 geo=rmfield(geo,'filter');
@@ -113,9 +111,9 @@
 end
 
 
-function [filter, parker, wang, gpuids]=parse_inputs(angles,argin)
+function [filter, parker, wang, usegpufft, gpuids]=parse_inputs(angles,argin)
 
-opts =  {'filter','parker','wang','gpuids'};
+opts =  {'filter','parker','wang','usegpufft','gpuids'};
 defaults=ones(length(opts),1);
 
 % Check inputs
@@ -176,6 +174,12 @@
                 end
                 filter=val;
             end
+        case 'usegpufft'
+            if default
+                usegpufft=2;
+            else
+                usegpufft=val;
+            end            
         case 'gpuids'
             if default
                 gpuids = GpuIds();
diff --git a/MATLAB/Compile.m b/MATLAB/Compile.m
index fdb45b07..c45686cb 100644
--- a/MATLAB/Compile.m
+++ b/MATLAB/Compile.m
@@ -97,6 +97,8 @@
     mex('-largeArrayDims', './Utilities/cuda_interface/AwminTV.cpp', '../Common/CUDA/POCS_TV2.cu', '../Common/CUDA/GpuIds.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS)
     mex('-largeArrayDims', './Utilities/cuda_interface/tvDenoise.cpp', '../Common/CUDA/tvdenoising.cu', '../Common/CUDA/GpuIds.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS)
     mex('-largeArrayDims', './Utilities/cuda_interface/AddNoise.cpp', '../Common/CUDA/RandomNumberGenerator.cu', '../Common/CUDA/GpuIds.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS)
+    mex('-largeArrayDims', './Utilities/cuda_interface/ApplyFbpFiltration.cpp', '../Common/CUDA/FbpFiltration.cu', '../Common/CUDA/GpuIds.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS)
+    mex('-largeArrayDims', './Utilities/cuda_interface/ApplyPaddingAndFbpFiltration.cpp', '../Common/CUDA/FbpFiltration.cu', '../Common/CUDA/GpuIds.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS)
     mex('-largeArrayDims', './Utilities/IO/VarianCBCT/mexReadXim.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS)
     mex('-largeArrayDims', './Utilities/GPU/getGpuName_mex.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS)
     mex('-largeArrayDims', './Utilities/GPU/getGpuCount_mex.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS)
@@ -109,6 +111,8 @@
     mex( './Utilities/cuda_interface/AwminTV.cpp', '../Common/CUDA/POCS_TV2.cu', '../Common/CUDA/GpuIds.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS)
     mex( './Utilities/cuda_interface/tvDenoise.cpp', '../Common/CUDA/tvdenoising.cu', '../Common/CUDA/GpuIds.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS)
     mex( './Utilities/cuda_interface/AddNoise.cpp', '../Common/CUDA/RandomNumberGenerator.cu', '../Common/CUDA/GpuIds.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS)
+    mex( './Utilities/cuda_interface/ApplyFbpFiltration.cpp', '../Common/CUDA/FbpFiltration.cu', '../Common/CUDA/GpuIds.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS)
+    mex( './Utilities/cuda_interface/ApplyPaddingAndFbpFiltration.cpp', '../Common/CUDA/FbpFiltration.cu', '../Common/CUDA/GpuIds.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS)
     mex( './Utilities/IO/VarianCBCT/mexReadXim.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS)
     mex( './Utilities/GPU/getGpuName_mex.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS)
     mex('./Utilities/GPU/getGpuCount_mex.cpp', '../Common/CUDA/gpuUtils.cu', '-outdir', outdir, FLAGS)
diff --git a/MATLAB/Utilities/cuda_interface/ApplyFbpFiltration.cpp b/MATLAB/Utilities/cuda_interface/ApplyFbpFiltration.cpp
new file mode 100644
index 00000000..ef9cc8e6
--- /dev/null
+++ b/MATLAB/Utilities/cuda_interface/ApplyFbpFiltration.cpp
@@ -0,0 +1,130 @@
+/*-------------------------------------------------------------------------
+ *
+ * MATLAB MEX  functions for convolution. Check inputs and parses 
+ * MATLAB data to C++ data.
+ *
+ *
+ * CODE by       Tomoyuki SADAKANE
+ *
+---------------------------------------------------------------------------
+---------------------------------------------------------------------------
+Copyright (c) 2015, University of Bath and CERN- European Organization for 
+Nuclear Research
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without 
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, 
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice, 
+this list of conditions and the following disclaimer in the documentation 
+and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+may be used to endorse or promote products derived from this software without
+specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+ ---------------------------------------------------------------------------
+
+Contact: tigre.toolbox@gmail.com
+Codes  : https://github.com/CERN/TIGRE
+--------------------------------------------------------------------------- 
+ */
+
+#include <math.h>
+#include <string.h>
+#include <tmwtypes.h>
+#include <mex.h>
+#include <matrix.h>
+#include <CUDA/FbpFiltration.hpp>
+#include <CUDA/GpuIds.hpp>
+#include <CUDA/gpuUtils.hpp>
+/**
+ * MEX gateway
+ * ApplyFbpFiltration(mat2dProj, mat1dFlt, "gpuids", gpuids);
+ */
+
+void mexFunction(int nlhs, mxArray *plhs[],
+                 int nrhs, mxArray const *prhs[])
+{
+    GpuIds gpuids;
+    if (nrhs==4) {
+        size_t iM = mxGetM(prhs[3]);
+        if (iM != 1) {
+            mexErrMsgIdAndTxt( "TIGRE:MEX:ApplyFbpFiltration:unknown","5th parameter must be a row vector.");
+            return;
+        }
+        size_t uiGpuCount = mxGetN(prhs[3]);
+        if (uiGpuCount == 0) {
+            mexErrMsgIdAndTxt( "TIGRE:MEX:ApplyFbpFiltration:unknown","5th parameter must be a row vector.");
+            return;
+        }
+        int* piGpuIds = (int*)mxGetData(prhs[3]);
+        gpuids.SetIds(uiGpuCount, piGpuIds);
+    } else {
+        int iGpuCount = GetGpuCount();
+        int* piDev = (int*)malloc(iGpuCount * sizeof(int));
+        for (int iI = 0; iI < iGpuCount; ++iI) {
+            piDev[iI] = iI;
+        }
+        gpuids.SetIds(iGpuCount, piDev);
+        free(piDev); piDev = 0;
+    }
+    if (nrhs < 2) {
+        mexErrMsgIdAndTxt("TIGRE:MEX:ApplyFbpFiltration", "At least two input argumet required.");
+    } else if (nrhs==2 || nrhs==4){
+        size_t mrows = mxGetM(prhs[0]);
+        size_t ncols = mxGetN(prhs[0]);
+        if (mrows ==1 && ncols ==1) {
+            mexErrMsgIdAndTxt("TIGRE:MEX:ApplyFbpFiltration", "1st parameter should not be 1x1");
+        }
+        mrows = mxGetM(prhs[1]);
+        ncols = mxGetN(prhs[1]);
+        if (mrows==1 || ncols !=1) {
+            mexErrMsgIdAndTxt("TIGRE:MEX:ApplyFbpFiltration", "2nd parameter should be Mx1 (M>1)");
+        }
+    } else if (nrhs>4) {
+        mexErrMsgIdAndTxt("TIGRE:MEX:ApplyFbpFiltration", "Too many imput argumets");
+    }
+    /////////////// First input argumet. Projection
+    mxArray const * const proj = prhs[0];
+    float* pfProj = static_cast<float*>(mxGetData(proj));
+    const mwSize numDimsProj = mxGetNumberOfDimensions(proj);  // get dim of proj
+    const mwSize *size_proj= mxGetDimensions(proj); //get size of proj
+    // printf("numDimsProj = %d\n", numDimsProj);
+    // for (int iI = 0; iI < numDimsProj; ++iI) {
+    //    printf("size_proj[%d] = %d\n", iI, size_proj[iI]);
+    // }
+    /////////////// Second input argumet. Filter
+    const mxArray* filter = prhs[1];
+    float* pfFilter = static_cast<float*>(mxGetData(filter));
+    const mwSize numDimsFilter = mxGetNumberOfDimensions(filter);  // get dim of filter
+    const mwSize *size_filter= mxGetDimensions(filter); //get size of filter
+    // printf("numDimsFilter = %d\n", numDimsFilter);
+    // for (int iI = 0; iI < numDimsFilter; ++iI) {
+    //    printf("size_filter[%d] = %d\n", iI, size_filter[iI]);
+    // }
+    if (size_filter[0] != size_proj[0]) {
+        mexErrMsgIdAndTxt("TIGRE:MEX:ApplyFbpFiltration", "Width of projection must be equal to filter.");
+    }
+    //////////////
+    //prepare outputs
+    // Allocate output projection
+    plhs[0] = mxCreateNumericArray(numDimsProj, size_proj, mxSINGLE_CLASS, mxREAL);
+    float *pfProjOut =(float*) mxGetPr(plhs[0]);
+    // call CUDA filtering
+    apply_filtration(pfProj, size_proj[0], size_proj[1], pfFilter, pfProjOut, gpuids);
+}
diff --git a/MATLAB/Utilities/cuda_interface/ApplyPaddingAndFbpFiltration.cpp b/MATLAB/Utilities/cuda_interface/ApplyPaddingAndFbpFiltration.cpp
new file mode 100644
index 00000000..3fb78493
--- /dev/null
+++ b/MATLAB/Utilities/cuda_interface/ApplyPaddingAndFbpFiltration.cpp
@@ -0,0 +1,136 @@
+/*-------------------------------------------------------------------------
+ *
+ * MATLAB MEX  functions for convolution. Check inputs and parses 
+ * MATLAB data to C++ data.
+ *
+ *
+ * CODE by       Tomoyuki SADAKANE
+ *
+---------------------------------------------------------------------------
+---------------------------------------------------------------------------
+Copyright (c) 2015, University of Bath and CERN- European Organization for 
+Nuclear Research
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without 
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, 
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice, 
+this list of conditions and the following disclaimer in the documentation 
+and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+may be used to endorse or promote products derived from this software without
+specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+ ---------------------------------------------------------------------------
+
+Contact: tigre.toolbox@gmail.com
+Codes  : https://github.com/CERN/TIGRE
+--------------------------------------------------------------------------- 
+ */
+
+#include <math.h>
+#include <string.h>
+#include <tmwtypes.h>
+#include <mex.h>
+#include <matrix.h>
+#include <CUDA/FbpFiltration.hpp>
+#include <CUDA/GpuIds.hpp>
+#include <CUDA/gpuUtils.hpp>
+/**
+ * MEX gateway
+ * ApplyPaddingAndFbpFiltration(mat2dProj, idx_min, idx_max, mat1dFlt, scale, gpuids);
+ */
+
+void mexFunction(int nlhs, mxArray *plhs[],
+                 int nrhs, mxArray const *prhs[])
+{
+    const int kiArgcMax = 6;
+    GpuIds gpuids;
+    if (nrhs==kiArgcMax) {
+        size_t iM = mxGetM(prhs[kiArgcMax-1]);
+        if (iM != 1) {
+            mexErrMsgIdAndTxt( "TIGRE:MEX:ApplyFbpFiltration:unknown","gpuids parameter must be a row vector.");
+            return;
+        }
+        size_t uiGpuCount = mxGetN(prhs[kiArgcMax-1]);
+        if (uiGpuCount == 0) {
+            mexErrMsgIdAndTxt( "TIGRE:MEX:ApplyFbpFiltration:unknown","gpuids parameter must be a row vector.");
+            return;
+        }
+        int* piGpuIds = (int*)mxGetData(prhs[kiArgcMax-1]);
+        gpuids.SetIds(uiGpuCount, piGpuIds);
+    } else {
+        int iGpuCount = GetGpuCount();
+        int* piDev = (int*)malloc(iGpuCount * sizeof(int));
+        for (int iI = 0; iI < iGpuCount; ++iI) {
+            piDev[iI] = iI;
+        }
+        gpuids.SetIds(iGpuCount, piDev);
+        free(piDev); piDev = 0;
+    }
+    if (nrhs < kiArgcMax-1) {
+        mexErrMsgIdAndTxt("TIGRE:MEX:ApplyFbpFiltration", "At least two input argumet required.");
+    } else if (nrhs==kiArgcMax-1 || nrhs==kiArgcMax){
+        size_t mrows = mxGetM(prhs[0]);
+        size_t ncols = mxGetN(prhs[0]);
+        if (mrows ==1 && ncols ==1) {
+            mexErrMsgIdAndTxt("TIGRE:MEX:ApplyFbpFiltration", "1st parameter should not be 1x1");
+        }
+        mrows = mxGetM(prhs[3]);
+        ncols = mxGetN(prhs[3]);
+        if (mrows==1 || ncols !=1) {
+            mexErrMsgIdAndTxt("TIGRE:MEX:ApplyFbpFiltration", "2nd parameter should be Mx1 (M>1)");
+        }
+    } else if (nrhs<5) {
+        mexErrMsgIdAndTxt("TIGRE:MEX:ApplyFbpFiltration", "Too many imput argumets");
+    }
+    /////////////// First input argumet. Projection
+    mxArray const * const proj = prhs[0];
+    float* pfProj = static_cast<float*>(mxGetData(proj));
+    const mwSize numDimsProj = mxGetNumberOfDimensions(proj);  // get dim of proj
+    const mwSize *size_proj= mxGetDimensions(proj); //get size of proj
+    // printf("numDimsProj = %d\n", numDimsProj);
+    // for (int iI = 0; iI < numDimsProj; ++iI) {
+    //    printf("size_proj[%d] = %d\n", iI, size_proj[iI]);
+    // }
+    // 2nd and 3rd inputs: From idx_begin-th to idx_end-th projections are processed.
+    // Note: idx_end is is not included in the range.
+    size_t idx_begin = (size_t)mxGetScalar(prhs[1]);
+    size_t idx_end   = (size_t)mxGetScalar(prhs[2]);
+    /////////////// 4th input argumet. Filter
+    const mxArray* filter = prhs[3];
+    float* pfFilter = static_cast<float*>(mxGetData(filter));
+    const mwSize numDimsFilter = mxGetNumberOfDimensions(filter);  // get dim of filter
+    const mwSize *size_filter= mxGetDimensions(filter); //get size of filter
+    // 5th input: Scaling
+    float fScale = (float)mxGetScalar(prhs[4]);
+    
+    size_t uiOffset = (idx_begin-1)*size_proj[0]*size_proj[1];
+    //////////////
+    //prepare outputs
+    // Allocate output projection
+    mwSize size_ret[3];
+    size_ret[0] = size_proj[0];
+    size_ret[1] = size_proj[1];
+    size_ret[2] = idx_end-idx_begin;
+    plhs[0] = mxCreateNumericArray(3, size_ret, mxSINGLE_CLASS, mxREAL);
+    float *pfProjOut =(float*) mxGetPr(plhs[0]);
+    // call CUDA filtering
+    apply_filtration2(pfProj, uiOffset, size_ret[0], size_ret[1]*size_ret[2], pfFilter, size_filter[0], fScale, pfProjOut, gpuids);
+}
diff --git a/MATLAB/Utilities/filtering.m b/MATLAB/Utilities/filtering.m
index 8974e875..7a8cc874 100644
--- a/MATLAB/Utilities/filtering.m
+++ b/MATLAB/Utilities/filtering.m
@@ -1,4 +1,4 @@
-function [ proj ] = filtering(proj,geo,angles,parker)
+function [ proj ] = filtering(proj,geo,angles,parker,varargin)
 %FILTERING Summary of this function goes here
 %   Detailed explanation goes here
 %--------------------------------------------------------------------------
@@ -19,9 +19,10 @@
 % Codes:              https://github.com/CERN/TIGRE/
 % Coded by:           Kyungsang Kim, modified by Ander Biguri 
 %--------------------------------------------------------------------------
+[usegpufft,gpuids]=parse_inputs(varargin);
 
 if parker
-	proj = permute(ParkerWeight(permute(proj,[2 1 3]),geo,angles,parker),[2 1 3]);
+	proj = ParkerWeight(proj,geo,angles,parker);
 	diff_angles = diff(angles(1,:)); angle_step = mean(abs(diff_angles)); % to be used later
 end 
 
@@ -30,26 +31,57 @@
 
 d = 1; % cut off (0~1)
 [filt] = Filter(geo.filter, ramp_kernel, filt_len, d);
-filt = repmat(filt',[1 geo.nDetector(2)]);
+if usegpufft>0
+    filt = filt';
+else
+    filt = repmat(filt',[1 geo.nDetector(2)]);
+end
 
-for ii=1:size(angles,2)
-    
-    fproj = (zeros(filt_len,geo.nDetector(2),'single'));
-    
-    fproj(round(filt_len/2-geo.nDetector(1)/2+1):round(filt_len/2+geo.nDetector(1)/2),:) = proj(:,:,ii);
-    
-    fproj = fft(fproj);   
-    
-    fproj = fproj.*filt;
-    
-    fproj = (real(ifft(fproj)));
-    
-    if parker
-        proj(:,:,ii) = fproj(round(end/2-geo.nDetector(1)/2+1):round(end/2+geo.nDetector(1)/2),:)/2/geo.dDetector(1)*(2*pi/  (pi/angle_step)   )/2*(geo.DSD(ii)/geo.DSO(ii));
-    else
-    	proj(:,:,ii) = fproj(round(end/2-geo.nDetector(1)/2+1):round(end/2+geo.nDetector(1)/2),:)/2/geo.dDetector(1)*(2*pi/  size(angles,2)   )/2*(geo.DSD(ii)/geo.DSO(ii));
-    end 
+proj=permute(proj,[2 1 3]);
+
+if usegpufft==2
+    bundle_size = 32;  %len(gpuids)
+    n_bundles = floor((size(angles,2)+bundle_size-1)/bundle_size);
+    n_angles = size(angles, 2);
+    for ii=1:n_bundles
+        
+        if ii ~= n_bundles
+            bundle_size_actual = bundle_size;
+        else
+            bundle_size_actual = n_angles-(ii-1)*bundle_size;
+        end
+        idx_begin = (ii-1)*bundle_size+1;
+        idx_end = (ii-1)*bundle_size+bundle_size_actual+1;
+        proj_flt = ApplyPaddingAndFbpFiltration(proj, idx_begin, idx_end, filt, 1, gpuids.devices);
+        bundle_range = idx_begin:(idx_end-1);
+        if parker
+            proj_flt = proj_flt/2/geo.dDetector(1)*(2*pi/  (pi/angle_step)  )/2*(geo.DSD(bundle_range)/geo.DSO(bundle_range));
+        else
+            proj_flt = proj_flt/2/geo.dDetector(1)*(2*pi/  size(angles,2)   )/2*(geo.DSD(bundle_range)/geo.DSO(bundle_range));
+        end
+        proj(:,:,bundle_range) = proj_flt;
+    end
+else 
     
+    for ii=1:size(angles,2)
+
+        fproj = (zeros(filt_len,geo.nDetector(2),'single'));
+
+        fproj(round(filt_len/2-geo.nDetector(1)/2+1):round(filt_len/2+geo.nDetector(1)/2),:) = proj(:,:,ii);
+        if usegpufft==1
+            fproj = ApplyFbpFiltration(fproj, filt, gpuids);
+        else
+            fproj = fft(fproj);   
+            fproj = fproj.*filt;
+            fproj = (real(ifft(fproj)));
+        end
+        if parker
+            proj(:,:,ii) = fproj(round(end/2-geo.nDetector(1)/2+1):round(end/2+geo.nDetector(1)/2),:)/2/geo.dDetector(1)*(2*pi/  (pi/angle_step)   )/2*(geo.DSD(ii)/geo.DSO(ii));
+        else
+            proj(:,:,ii) = fproj(round(end/2-geo.nDetector(1)/2+1):round(end/2+geo.nDetector(1)/2),:)/2/geo.dDetector(1)*(2*pi/  size(angles,2)   )/2*(geo.DSD(ii)/geo.DSO(ii));
+        end 
+
+    end
 end
 
 proj=permute(proj,[2 1 3]);
@@ -92,3 +124,59 @@
 return
 
 end
+
+function [usegpufft, gpuids]=parse_inputs(argin)
+
+opts =  {'usegpufft','gpuids'};
+defaults=ones(length(opts),1);
+
+% Check inputs
+nVarargs = length(argin);
+if mod(nVarargs,2)
+    error('TIGRE:filtering:InvalidInput','Invalid number of inputs')
+end
+
+% check if option has been passed as input
+for ii=1:2:nVarargs
+    ind=find(ismember(opts,lower(argin{ii})));
+    if ~isempty(ind)
+        defaults(ind)=0;
+    end
+end
+
+for ii=1:length(opts)
+    opt=opts{ii};
+    default=defaults(ii);
+    % if one option is not default, then extract value from input
+    if default==0
+        ind=double.empty(0,1);jj=1;
+        while isempty(ind)
+            ind=find(isequal(opt,lower(argin{jj})));
+            jj=jj+1;
+        end
+        if isempty(ind)
+            error('TIGRE:filtering:InvalidInput',['Optional parameter "' argin{jj} '" does not exist' ]); 
+        end
+        val=argin{jj};
+    end
+    
+    switch opt
+        case 'usegpufft'
+            if default
+                usegpufft=2;
+            else
+                usegpufft=val;
+            end            
+        case 'gpuids'
+            if default
+                gpuids = GpuIds();
+            else
+                gpuids = val;
+            end
+       
+        otherwise
+            error('TIGRE:filtering:InvalidInput',['Invalid input name:', num2str(opt),'\n No such option in FDK()']);
+    end
+end
+
+end
diff --git a/MATLAB/mex_CUDA_win64_MSV2019.xml b/MATLAB/mex_CUDA_win64_MSV2019.xml
index 5f2f0a12..892262c7 100644
--- a/MATLAB/mex_CUDA_win64_MSV2019.xml
+++ b/MATLAB/mex_CUDA_win64_MSV2019.xml
@@ -44,7 +44,7 @@
         LINKFLAGS="/nologo /manifest"
         LINKTYPE="/DLL "
         LINKEXPORT="/EXPORT:mexFunction"
-        LINKLIBS="/LIBPATH:&quot;$MATLABROOT\extern\lib\$ARCH\microsoft&quot; libmx.lib libmex.lib libmat.lib cudart.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib  ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib"
+        LINKLIBS="/LIBPATH:&quot;$MATLABROOT\extern\lib\$ARCH\microsoft&quot; libmx.lib libmex.lib libmat.lib cudart.lib cufft.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib  ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib"
         LINKDEBUGFLAGS="/debug /PDB:&quot;$TEMPNAME$LDEXT.pdb&quot;"
         LINKOPTIMFLAGS=""
         OBJEXT=".obj"
diff --git a/MATLAB/mex_CUDA_win64_MVS2013.xml b/MATLAB/mex_CUDA_win64_MVS2013.xml
index e6620936..91d9f15d 100644
--- a/MATLAB/mex_CUDA_win64_MVS2013.xml
+++ b/MATLAB/mex_CUDA_win64_MVS2013.xml
@@ -46,7 +46,7 @@
           LINKFLAGS="/nologo /manifest"
           LINKTYPE="/DLL "
           LINKEXPORT="/EXPORT:mexFunction"
-          LINKLIBS="/LIBPATH:&quot;$MATLABROOT\extern\lib\$ARCH\microsoft&quot; libmx.lib libmex.lib libmat.lib gpu.lib cudart.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib  ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib"
+          LINKLIBS="/LIBPATH:&quot;$MATLABROOT\extern\lib\$ARCH\microsoft&quot; libmx.lib libmex.lib libmat.lib gpu.lib cudart.lib cufft.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib  ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib"
           LINKDEBUGFLAGS="/debug /PDB:&quot;$TEMPNAME$LDEXT.pdb&quot;"
           LINKOPTIMFLAGS=""
           OBJEXT=".obj"
diff --git a/MATLAB/mex_CUDA_win64_MVS2015.xml b/MATLAB/mex_CUDA_win64_MVS2015.xml
index 0eda60bc..430efcf0 100644
--- a/MATLAB/mex_CUDA_win64_MVS2015.xml
+++ b/MATLAB/mex_CUDA_win64_MVS2015.xml
@@ -44,7 +44,7 @@
         LINKFLAGS="/nologo /manifest"
         LINKTYPE="/DLL "
         LINKEXPORT="/EXPORT:mexFunction"
-        LINKLIBS="/LIBPATH:&quot;$MATLABROOT\extern\lib\$ARCH\microsoft&quot; libmx.lib libmex.lib libmat.lib cudart.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib  ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib"
+        LINKLIBS="/LIBPATH:&quot;$MATLABROOT\extern\lib\$ARCH\microsoft&quot; libmx.lib libmex.lib libmat.lib cudart.lib cufft.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib  ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib"
         LINKDEBUGFLAGS="/debug /PDB:&quot;$TEMPNAME$LDEXT.pdb&quot;"
         LINKOPTIMFLAGS=""
         OBJEXT=".obj"
diff --git a/MATLAB/mex_CUDA_win64_MVS2022.xml b/MATLAB/mex_CUDA_win64_MVS2022.xml
index cc140d68..705e18a0 100644
--- a/MATLAB/mex_CUDA_win64_MVS2022.xml
+++ b/MATLAB/mex_CUDA_win64_MVS2022.xml
@@ -44,7 +44,7 @@
         LINKFLAGS="/nologo /manifest"
         LINKTYPE="/DLL "
         LINKEXPORT="/EXPORT:mexFunction"
-        LINKLIBS="/LIBPATH:&quot;$MATLABROOT\extern\lib\$ARCH\microsoft&quot; libmx.lib libmex.lib libmat.lib cudart.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib  ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib"
+        LINKLIBS="/LIBPATH:&quot;$MATLABROOT\extern\lib\$ARCH\microsoft&quot; libmx.lib libmex.lib libmat.lib cudart.lib cufft.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib  ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib"
         LINKDEBUGFLAGS="/debug /PDB:&quot;$TEMPNAME$LDEXT.pdb&quot;"
         LINKOPTIMFLAGS=""
         OBJEXT=".obj"
diff --git a/Python/setup.py b/Python/setup.py
index ad35b470..e61ac519 100644
--- a/Python/setup.py
+++ b/Python/setup.py
@@ -509,6 +509,26 @@ def include_headers(filename_list, sdist=False):
 )
 
 
+FbpFiltration_ext = Extension(
+    "_FbpFiltration",
+    sources=include_headers(
+        [
+            "../Common/CUDA/TIGRE_common.cpp",
+            "../Common/CUDA/FbpFiltration.cu",
+            "../Common/CUDA/GpuIds.cpp",
+            "../Common/CUDA/gpuUtils.cu",
+            "tigre/utilities/cuda_interface/_FbpFiltration.pyx",
+        ],
+        sdist=sys.argv[1] == "sdist",
+    ),
+    define_macros=[("IS_FOR_PYTIGRE", None)],
+    library_dirs=[CUDA["lib64"]],
+    libraries=["cudart", "cufft"],
+    language="c++",
+    runtime_library_dirs=[CUDA["lib64"]] if not IS_WINDOWS else None,
+    include_dirs=[NUMPY_INCLUDE, CUDA["include"], "../Common/CUDA/"],
+)
+
 setup(
     name="pytigre",
     version="2.4.0",
@@ -516,7 +536,7 @@ def include_headers(filename_list, sdist=False):
     packages=find_packages(),
     include_package_data=True,
     data_files=[("data", ["../Common/data/head.mat"])],
-    ext_modules=[Ax_ext, Atb_ext, tvdenoising_ext, minTV_ext, AwminTV_ext, gpuUtils_ext, RandomNumberGenerator_ext],
+    ext_modules=[Ax_ext, Atb_ext, tvdenoising_ext, minTV_ext, AwminTV_ext, gpuUtils_ext, RandomNumberGenerator_ext, FbpFiltration_ext],
     py_modules=["tigre.py"],
     cmdclass={"build_ext": BuildExtension},
     install_requires=["Cython", "matplotlib", "numpy", "scipy", "tqdm"],
diff --git a/Python/tigre/algorithms/single_pass_algorithms.py b/Python/tigre/algorithms/single_pass_algorithms.py
index 3fac0621..6110f735 100644
--- a/Python/tigre/algorithms/single_pass_algorithms.py
+++ b/Python/tigre/algorithms/single_pass_algorithms.py
@@ -73,6 +73,9 @@ def FDK(proj, geo, angles, **kwargs):
     verbose = kwargs["verbose"] if "verbose" in kwargs else False
 
     gpuids = kwargs["gpuids"] if "gpuids" in kwargs else None
+
+    use_gpu_for_filtration = kwargs["use_gpu_for_filtration"] if "use_gpu_for_filtration" in kwargs else 2
+
     geo = copy.deepcopy(geo)
     geo.check_geo(angles)
     geo.checknans()
@@ -88,7 +91,7 @@ def FDK(proj, geo, angles, **kwargs):
     w = geo.DSD[0] / np.sqrt((geo.DSD[0] ** 2 + xx ** 2 + yy ** 2))
     np.multiply(proj, w, out=proj_filt)
 
-    proj_filt = filtering(proj_filt, geo, angles, parker=False, verbose=verbose)
+    proj_filt = filtering(proj_filt, geo, angles, parker=False, verbose=verbose, use_gpu=use_gpu_for_filtration, gpuids=gpuids)
     
     return Atb(proj_filt, geo, geo.angles, "FDK", gpuids=gpuids)
 
diff --git a/Python/tigre/utilities/cuda_interface/_FbpFiltration.pyx b/Python/tigre/utilities/cuda_interface/_FbpFiltration.pyx
new file mode 100644
index 00000000..6705b219
--- /dev/null
+++ b/Python/tigre/utilities/cuda_interface/_FbpFiltration.pyx
@@ -0,0 +1,57 @@
+#   This file is part of the TIGRE Toolbox
+
+#   Copyright (c) 2015, University of Bath and
+#                       CERN-European Organization for Nuclear Research
+#                       All rights reserved.
+
+#   License:            Open Source under BSD.
+#                       See the full license at
+#                       https://github.com/CERN/TIGRE/license.txt
+
+#   Contact:            tigre.toolbox@gmail.com
+#   Codes:              https://github.com/CERN/TIGRE/
+# --------------------------------------------------------------------------
+#   Coded by:          Tomoyuki SADAKANE
+
+cimport numpy as np 
+import numpy as np
+from tigre.utilities.errors import TigreCudaCallError
+from tigre.utilities.cuda_interface._gpuUtils cimport GpuIds as c_GpuIds, convert_to_c_gpuids, free_c_gpuids
+
+np.import_array()
+
+from libc.stdlib cimport malloc, free 
+
+cdef extern from "numpy/arrayobject.h":
+    void PyArray_ENABLEFLAGS(np.ndarray arr, int flags)
+    void PyArray_CLEARFLAGS(np.ndarray arr, int flags)
+
+cdef extern from "FbpFiltration.hpp":
+    cdef void apply_filtration(const float* pf32In, size_t uiXLen, size_t uiYLen, float* pf32Filter, float* pfOut, const c_GpuIds gpuids)
+
+def cuda_raise_errors(error_code):
+    if error_code:
+        raise TigreCudaCallError('FFT::', error_code)
+
+def apply(np.ndarray[np.float32_t, ndim=2] src, np.ndarray[np.float32_t, ndim=1] flt, gpuids=None):
+    cdef c_GpuIds* c_gpuids = convert_to_c_gpuids(gpuids)
+    if not c_gpuids:
+        raise MemoryError()
+
+    cdef np.npy_intp size_img[2]
+    size_img[0]= <np.npy_intp> src.shape[0]
+    size_img[1]= <np.npy_intp> src.shape[1]
+
+    cdef float* c_imgout = <float*> malloc(size_img[0] *size_img[1]* sizeof(float))
+
+    src = np.ascontiguousarray(src)
+    flt = np.ascontiguousarray(flt)
+
+    cdef float* c_src = <float*> src.data
+    cdef float* c_flt = <float*> flt.data
+    cdef size_t uiXLen = size_img[1]
+    cdef size_t uiYLen = size_img[0]
+    cuda_raise_errors(apply_filtration(c_src, uiXLen, uiYLen, c_flt, c_imgout, c_gpuids[0]))
+    imgout = np.PyArray_SimpleNewFromData(2, size_img, np.NPY_FLOAT32, c_imgout)
+    PyArray_ENABLEFLAGS(imgout, np.NPY_OWNDATA)
+    return imgout
diff --git a/Python/tigre/utilities/filtering.py b/Python/tigre/utilities/filtering.py
index f7aedc17..08a2ffc1 100644
--- a/Python/tigre/utilities/filtering.py
+++ b/Python/tigre/utilities/filtering.py
@@ -5,14 +5,17 @@
 import numpy as np
 from scipy.fft  import fft, ifft
 
+import matplotlib.pyplot as plt
+
 import warnings
 
 import numpy as np
 from tigre.utilities.parkerweight import parkerweight
+import _FbpFiltration as fbpfiltration
 
 
 # TODO: Fix parker
-def filtering(proj, geo, angles, parker, verbose=False):
+def filtering(proj, geo, angles, parker, verbose=False, use_gpu=True, gpuids=None):
     if parker:
         proj=parkerweight(proj.transpose(0,2,1),geo,angles,parker).transpose(0,2,1)
 
@@ -21,34 +24,50 @@ def filtering(proj, geo, angles, parker, verbose=False):
 
     d=1
     filt=filter(geo.filter,ramp_kernel[0],filt_len,d,verbose=verbose)
-    filt=np.kron(np.ones((np.int64(geo.nDetector[0]),1),dtype=np.float32),filt)
 
     padding = int((filt_len-geo.nDetector[1])//2 )
     scale_factor = (geo.DSD[0]/geo.DSO[0]) * (2 * np.pi/ len(angles)) / ( 4 * geo.dDetector[0] ) 
-
-    #filter 2 projection at a time packing in to complex container
-    fproj=np.empty((geo.nDetector[0],filt_len),dtype=np.complex64)
-    for i in range(0,angles.shape[0]-1,2):
-        fproj.fill(0)
-        fproj.real[:,padding:padding+geo.nDetector[1]]=proj[i]
-        fproj.imag[:,padding:padding+geo.nDetector[1]]=proj[i+1]
-
-        fproj=fft(fproj,axis=1)
-        fproj=fproj*filt
-        fproj=ifft(fproj,axis=1)
-
-        proj[i]=fproj.real[:,padding:padding+geo.nDetector[1]] * scale_factor
-        proj[i+1]=fproj.imag[:,padding:padding+geo.nDetector[1]] * scale_factor
-
-    #if odd number of projections filter last solo
-    if angles.shape[0] % 2:
-        fproj.fill(0)
-        fproj.real[:,padding:padding+geo.nDetector[1]]=proj[angles.shape[0]-1]
-
-        fproj=fft(fproj,axis=1)
-        fproj=fproj*filt
-        fproj=np.real(ifft(fproj,axis=1))     
-        proj[angles.shape[0]-1]=fproj[:,padding:padding+geo.nDetector[1]] * scale_factor
+    if use_gpu==2:
+        bundle_size = 32#len(gpuids)
+        n_bundles = (angles.shape[0]+bundle_size-1)//bundle_size
+        for idx_bundle in range(0,n_bundles):
+            bundle_size_actual = bundle_size if idx_bundle != n_bundles-1 else angles.shape[0]-idx_bundle*bundle_size
+            idx_begin = idx_bundle*bundle_size
+            idx_end = idx_bundle*bundle_size+bundle_size_actual
+            proj[idx_begin:idx_end] = fbpfiltration.apply_padding_and_filtering(proj, idx_begin, idx_end, filt, scale_factor, gpuids) 
+    elif use_gpu==1:
+        fproj=np.empty((geo.nDetector[0],filt_len),dtype=np.float32)
+        for idx_proj in range(0,angles.shape[0]):
+            fproj.fill(0)
+            fproj[:,padding:padding+geo.nDetector[1]]=proj[idx_proj]
+            fproj = fbpfiltration.apply(fproj, filt, gpuids)
+            proj[idx_proj]=fproj[:,padding:padding+geo.nDetector[1]] * scale_factor
+    else:
+        filt=np.kron(np.ones((np.int64(geo.nDetector[0]),1),dtype=np.float32),filt)
+
+        #filter 2 projection at a time packing in to complex container
+        fproj=np.empty((geo.nDetector[0],filt_len),dtype=np.complex64)
+        for i in range(0,angles.shape[0]-1,2):
+            fproj.fill(0)
+            fproj.real[:,padding:padding+geo.nDetector[1]]=proj[i]
+            fproj.imag[:,padding:padding+geo.nDetector[1]]=proj[i+1]
+
+            fproj=fft(fproj,axis=1)
+            fproj=fproj*filt
+            fproj=ifft(fproj,axis=1)
+
+            proj[i]=fproj.real[:,padding:padding+geo.nDetector[1]] * scale_factor
+            proj[i+1]=fproj.imag[:,padding:padding+geo.nDetector[1]] * scale_factor
+
+        #if odd number of projections filter last solo
+        if angles.shape[0] % 2:
+            fproj.fill(0)
+            fproj.real[:,padding:padding+geo.nDetector[1]]=proj[angles.shape[0]-1]
+
+            fproj=fft(fproj,axis=1)
+            fproj=fproj*filt
+            fproj=np.real(ifft(fproj,axis=1))     
+            proj[angles.shape[0]-1]=fproj[:,padding:padding+geo.nDetector[1]] * scale_factor
 
     return proj