1 change: 1 addition & 0 deletions README.md
@@ -10,6 +10,7 @@
<a href="https://arxiv.org/abs/2401.10891"><img src='https://img.shields.io/badge/arXiv-Depth Anything-red' alt='Paper PDF'></a>
<a href='https://depth-anything.github.io'><img src='https://img.shields.io/badge/Project_Page-Depth Anything-green' alt='Project Page'></a>
<a href='https://huggingface.co/spaces/LiheYoung/Depth-Anything'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a>
[![Replicate](https://replicate.com/cjwbw/depth-anything/badge)](https://replicate.com/cjwbw/depth-anything)
</div>

This work presents Depth Anything, a highly practical solution for robust monocular depth estimation by training on a combination of 1.5M labeled images and **62M+ unlabeled images**.
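The badge added above links to a hosted demo of this predictor on Replicate. As a rough illustration (not part of this PR), the hosted model can be invoked from Python with the official `replicate` client; `demo.jpg` is a placeholder file, and depending on client version the model reference may need an explicit `:version` suffix:

```python
# Hypothetical usage of the hosted model behind the badge (not in this PR).
# Requires `pip install replicate` and REPLICATE_API_TOKEN in the environment.
import replicate

output = replicate.run(
    "cjwbw/depth-anything",  # may need a ":version" suffix
    input={"image": open("demo.jpg", "rb"), "encoder": "vitl"},
)
print(output)  # typically a URL to the colormapped depth PNG
```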
16 changes: 16 additions & 0 deletions cog.yaml
@@ -0,0 +1,16 @@
# Configuration for Cog ⚙️
# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md

build:
  gpu: true
  system_packages:
    - "libgl1-mesa-glx"
    - "libglib2.0-0"
  python_version: "3.11"
  python_packages:
    - "opencv-python==4.9.0.80"
    - "torch==2.0.1"
    - "torchvision==0.15.2"
    - "tqdm==4.66.1"
    - "huggingface_hub==0.20.3"
predict: "predict.py:Predictor"
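The `predict` entry points Cog at the `Predictor` class defined in predict.py below. Roughly, and leaving out the HTTP server the real `cog` runtime wraps around it, the container lifecycle amounts to the following sketch (`demo.jpg` is a placeholder input):

```python
# Simplified sketch of how Cog drives predict.py (the real runtime wraps
# this in an HTTP server; see the Cog docs linked above).
from predict import Predictor

predictor = Predictor()
predictor.setup()  # runs once, at container start
result = predictor.predict(image="demo.jpg", encoder="vitl")  # per request
print(result)  # /tmp/out.png
```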
85 changes: 85 additions & 0 deletions predict.py
@@ -0,0 +1,85 @@
# Prediction interface for Cog ⚙️
# https://github.com/replicate/cog/blob/main/docs/python.md

import cv2
import numpy as np
import torch
import torch.nn.functional as F
from torchvision.transforms import Compose
from cog import BasePredictor, Input, Path

from depth_anything.dpt import DepthAnything
from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet


class Predictor(BasePredictor):
    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""
        encoder_options = ["vits", "vitb", "vitl"]
        self.device = "cuda:0"
        model_cache = "model_cache"
        self.models = {
            k: DepthAnything.from_pretrained(
                f"LiheYoung/depth_anything_{k}14", cache_dir=model_cache
            ).to(self.device)
            for k in encoder_options
        }
        self.total_params = {
            k: sum(param.numel() for param in self.models[k].parameters())
            for k in encoder_options
        }

        self.transform = Compose(
            [
                Resize(
                    width=518,
                    height=518,
                    resize_target=False,
                    keep_aspect_ratio=True,
                    ensure_multiple_of=14,
                    resize_method="lower_bound",
                    image_interpolation_method=cv2.INTER_CUBIC,
                ),
                NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
                PrepareForNet(),
            ]
        )

    def predict(
        self,
        image: Path = Input(description="Input image"),
        encoder: str = Input(
            description="Choose an encoder.",
            default="vitl",
            choices=["vits", "vitb", "vitl"],
        ),
    ) -> Path:
        """Run a single prediction on the model"""
        depth_anything = self.models[encoder]
        total_params = self.total_params[encoder]
        print("Total parameters: {:.2f}M".format(total_params / 1e6))

        depth_anything.eval()

        raw_image = cv2.imread(str(image))
        image = cv2.cvtColor(raw_image, cv2.COLOR_BGR2RGB) / 255.0

        h, w = image.shape[:2]

        image = self.transform({"image": image})["image"]
        image = torch.from_numpy(image).unsqueeze(0).to(self.device)

        with torch.no_grad():
            depth = depth_anything(image)

        depth = F.interpolate(
            depth[None], (h, w), mode="bilinear", align_corners=False
        )[0, 0]
        depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0

        depth = depth.cpu().numpy().astype(np.uint8)
        depth_color = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)
        output_path = "/tmp/out.png"
        cv2.imwrite(output_path, depth_color)

        return Path(output_path)
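A note on the output: the network predicts relative depth, and `predict()` min-max normalizes it to 8 bits purely for the INFERNO color preview. A hypothetical variant (not part of this PR) that preserves more precision would write the normalized depth as a 16-bit grayscale PNG instead:

```python
# Hypothetical alternative output (not in this PR): 16-bit grayscale depth.
import cv2
import numpy as np

def save_depth16(depth_float: np.ndarray, path: str = "/tmp/out16.png") -> str:
    """Min-max normalize relative depth to uint16 and write a PNG."""
    d = depth_float - depth_float.min()
    d = d / max(float(d.max()), 1e-8) * 65535.0  # guard against a flat map
    cv2.imwrite(path, d.astype(np.uint16))  # OpenCV supports 16-bit PNGs
    return path
```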