Skip to content

TensorRT Export & Performance #649

@naca-ryform

Description

@naca-ryform

Search before asking

  • I have searched the RF-DETR issues and found no similar bug report.

Bug

What is the correct method for exporting the RF-DETR segmentation models to TensorRT? Specifically trying nano and small, my exported models (both with and without fp16, and with and without simplified ONNX) perform extremely poorly, getting confidence scores below 0.1 for detected persons.

I can see on the benchmarks on the website you have used TensorRT fp16, so how was this done while maintaining accuracy and performance? I've attached the script I've created for exporting to TensorRT currently.

Environment

RF-DETR Version: 1.4.1
OS: WSL2 Ubuntu: 24.04
Python version: 3.11.14
PyTorch Version: 2.8.0
Cuda version: 13.0
GPU: RTX 3080

Minimal Reproducible Example

import sys
import argparse
import os
import subprocess
from pathlib import Path
from copy import deepcopy

import torch

# Make a sibling "rf-detr" checkout importable when the package isn't installed.
project_root = Path(__file__).parent.parent.absolute()
rf_detr_path = project_root / "rf-detr"
rf_detr_str = str(rf_detr_path)
if rf_detr_str not in sys.path:
    sys.path.insert(0, rf_detr_str)
    print(f"Added {rf_detr_path} to sys.path")


def trtexec_fp32(onnx_path: str, verbose: bool = False) -> str:
    """
    Build a TensorRT engine from an ONNX file using trtexec in FP32.

    Mirrors the flags used by rfdetr/deploy/export.py, but without --fp16.

    Args:
        onnx_path: Path to the ONNX model. The engine is written alongside it
            with a ``.engine`` suffix.
        verbose: If True, pass ``--verbose`` to trtexec for detailed logs.

    Returns:
        Path to the generated ``.engine`` file.

    Raises:
        RuntimeError: If trtexec is not installed or exits with a non-zero
            status.
    """
    # with_suffix only swaps the final extension; str.replace(".onnx", ...)
    # would also mangle any ".onnx" substring earlier in the path.
    engine_path = str(Path(onnx_path).with_suffix(".engine"))

    # Argument list + shell=False: robust to spaces/special characters in
    # paths and avoids shell-injection issues.
    command = [
        "trtexec",
        f"--onnx={onnx_path}",
        f"--saveEngine={engine_path}",
        "--memPoolSize=workspace:4096",  # No --fp16: build in full FP32
        "--useCudaGraph",
        "--useSpinWait",
        "--warmUp=500",
        "--avgRuns=1000",
        "--duration=10",
    ]
    if verbose:
        command.append("--verbose")

    print(f"Running trtexec command:\n{' '.join(command)}\n")

    try:
        result = subprocess.run(command, capture_output=True, text=True)
    except FileNotFoundError as err:
        # Normalize "binary not on PATH" to the same error callers already
        # handle for a failed build.
        print("TensorRT build failed: trtexec not found on PATH")
        raise RuntimeError("trtexec failed") from err

    print(result.stdout)
    if result.returncode != 0:
        print(f"TensorRT build failed:\n{result.stderr}")
        raise RuntimeError("trtexec failed")

    print(f"TensorRT engine saved to: {engine_path}")
    return engine_path


def main():
    """Export an RF-DETR segmentation model to ONNX and, optionally, TensorRT."""
    parser = argparse.ArgumentParser(description="Export RF-DETR model using official API")
    parser.add_argument("--output_dir", type=str, default="export_output_official", help="Output directory")
    parser.add_argument("--simplify", action="store_true", default=False, help="Simplify ONNX model (default: False)")
    parser.add_argument("--tensorrt", action="store_true", help="Build TensorRT engine after ONNX export")
    parser.add_argument("--fp16", action="store_true", help="Use FP16 for TensorRT (default: FP32)")
    parser.add_argument("--model", type=str, default="small",
                        choices=["nano", "small", "medium", "large"],
                        help="Model size to export (default: small)")
    parser.add_argument("--verbose", action="store_true", help="Verbose output")
    args = parser.parse_args()

    # Resolve the segmentation class by name instead of an if/elif chain;
    # argparse `choices` guarantees the attribute exists.
    print(f"Loading RFDETRSeg{args.model.capitalize()} model...")
    import rfdetr
    model = getattr(rfdetr, f"RFDETRSeg{args.model.capitalize()}")()

    print(f"Model loaded. Resolution: {model.model_config.resolution}")
    print(f"Pretrain weights: {model.model_config.pretrain_weights}")

    # Export using lower-level API (workaround for segmentation bug in Model.export())
    from rfdetr.deploy.export import export_onnx, make_infer_image, onnx_simplify, trtexec

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    resolution = model.model_config.resolution

    # Work on a copy of the inner nn.Module so the loaded wrapper stays intact.
    core = deepcopy(model.model.model)
    core.eval()
    if hasattr(core, 'export'):
        core.export()  # Switch the module to export mode

    # Dummy CPU input at the model's native resolution.
    dummy_inputs = make_infer_image(None, (resolution, resolution), 1, "cpu")
    core.cpu()

    input_names = ['input']
    output_names = ['dets', 'labels', 'masks']  # Segmentation model outputs

    print(f"\nExporting to ONNX in '{output_dir}'...")
    output_file = export_onnx(
        output_dir=str(output_dir),
        model=core,
        input_names=input_names,
        input_tensors=dummy_inputs,
        output_names=output_names,
        dynamic_axes=None,
        backbone_only=False,
        verbose=args.verbose,
        opset_version=17,
    )
    print(f"ONNX export complete: {output_file}")

    if args.simplify:
        print("\nSimplifying ONNX model...")
        output_file = onnx_simplify(
            onnx_dir=output_file,
            input_names=input_names,
            input_tensors=dummy_inputs,
            force=True,
        )
        print(f"Simplified ONNX: {output_file}")

    if args.tensorrt:
        print("\nBuilding TensorRT engine...")
        if args.fp16:
            # The library's trtexec helper hardcodes --fp16; it only reads
            # .verbose/.profile/.dry_run, so a Namespace stands in for args.
            trtexec(output_file, argparse.Namespace(verbose=args.verbose, profile=False, dry_run=False))
        else:
            # Our custom FP32 build path.
            trtexec_fp32(output_file, verbose=args.verbose)
        print("\nTensorRT engine build complete!")

    print("\nDone!")


if __name__ == "__main__":
    main()

Additional

No response

Are you willing to submit a PR?

  • Yes, I'd like to help by submitting a PR!

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug — Something isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions