import decord
from pathlib import Path
import torch
import cv2
import numpy as np
from inflateDEMO import I3DResNet50
import torchvision.transforms as T
import sys
from tqdm import tqdm
import ffmpeg
import argparse
from nested_lookup import nested_lookup
import gc
# Make the local classification package importable exactly once: print the
# existing index when the path is already on sys.path, append it otherwise.
_CLASSIFICATION_DIR = "d:\\Users\\user\\Documents\\postata\\RARP\\Clasification"
try:
    # list.index raises ValueError (only) when the entry is absent.
    print(sys.path.index(_CLASSIFICATION_DIR))
except ValueError:
    sys.path.append(_CLASSIFICATION_DIR)
from Models import RARP_NVB_ResNet50
def extract_frames_ffmpeg(video_path, start_frame, end_frame, width=None, height=None, fps=30):
    """Decode frames [start_frame, end_frame) of a video into a uint8 array.

    Runs ffmpeg (CUDA hwaccel) and reads raw RGB24 frames from a pipe.

    Args:
        video_path: path to the video file.
        start_frame: index of the first frame to extract.
        end_frame: index to stop at (exclusive).
        width, height: frame dimensions; when both are truthy the flat byte
            buffer is reshaped to (num_frames, height, width, 3), otherwise
            the flat 1-D buffer is returned.
        fps: frames per second used to convert frame indices to seconds.

    Returns:
        np.ndarray of uint8 pixel data in RGB channel order.
    """
    # BUG FIX: ffmpeg's `t` option is a *duration*, not an end timestamp.
    # The original passed end_frame / fps, decoding far more frames than the
    # requested window for every chunk after the first.
    out, _ = (
        ffmpeg
        .input(video_path,
               ss=start_frame / fps,
               t=(end_frame - start_frame) / fps,
               hwaccel='cuda')
        .output('pipe:', format='rawvideo', pix_fmt='rgb24')
        .run(capture_stdout=True)
    )
    video = np.frombuffer(out, np.uint8)
    if width and height:
        # (num_frames, height, width, channels) — zero-copy view of the buffer.
        video = video.reshape((-1, height, width, 3))
    del out
    gc.collect()
    return video
def ffmpegVideoInfo(VideoPath: Path):
    """Probe a video with ffmpeg.

    Returns:
        (fps, (width, height), total_frames, None) — the trailing None keeps
        the tuple shape parallel with decordVideoInfo, which returns the
        reader object in that slot.
    """
    video_info = ffmpeg.probe(str(VideoPath.absolute()))
    # avg_frame_rate is a rational string such as "30000/1001". Parse it
    # explicitly rather than eval()-ing probe output (eval on external data
    # is unsafe and unnecessary here).
    rate = nested_lookup("avg_frame_rate", video_info)[0]
    num, _, den = rate.partition('/')
    fps = int(num) / int(den) if den else float(num)
    w = int(nested_lookup("width", video_info)[0])
    h = int(nested_lookup("height", video_info)[0])
    total_frames = int(nested_lookup("nb_frames", video_info)[0])
    return (fps, (w, h), total_frames, None)
def decordVideoInfo(VideoPath: Path):
    """Probe a video with decord.

    Returns:
        (fps, None, total_frames, reader) — size slot is None; the open
        decord.VideoReader is returned so the caller can batch-read frames.
    """
    decord.bridge.set_bridge('native')
    reader = decord.VideoReader(str(VideoPath.absolute()))
    return (reader.get_avg_fps(), None, len(reader), reader)
def seconds_to_hms(seconds):
    """Format a duration in seconds as 'H:MM:SS' (hours not zero-padded)."""
    hours, remainder = divmod(seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f'{int(hours)}:{int(minutes):02}:{int(secs):02}'
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--BaseLib", default="ffmpeg", type=str)
    parser.add_argument("-i", "--Input", type=str)
    parser.add_argument("-o", "--Output", type=str)
    parser.add_argument("-t", "--Target", type=str)
    parser.add_argument("-c", "--Chunk", type=int, default=15)
    parser.add_argument("-b", "--BaseModel", type=str)
    args = parser.parse_args()

    # Two copies of the same checkpoint: one gets inflated into a 3D (video)
    # network, the other stays 2D to embed the single target frame.
    RN50Model = RARP_NVB_ResNet50.load_from_checkpoint(args.BaseModel)
    RN50ModelToEval = RARP_NVB_ResNet50.load_from_checkpoint(args.BaseModel)

    # Dataset-specific per-channel statistics on the 0-255 scale (not
    # ImageNet's 0-1 values) — inputs below are raw uint8 pixels cast to float.
    mean, std = ([30.38144216, 42.03988769, 97.8896116], [40.63141752, 44.26910074, 50.29294373])
    transforms = T.Compose([
        T.Resize((256, 256), antialias=True, interpolation=T.InterpolationMode.BICUBIC),
        T.CenterCrop(224),
        T.Normalize(mean, std)
    ])

    # Target frame: cv2 loads BGR; video chunks are flipped to BGR below so
    # both embeddings are computed in the same channel order.
    frameToFind = cv2.imread(str(Path(args.Target)), cv2.IMREAD_COLOR)
    frameToFind = torch.Tensor(frameToFind).permute(2, 0, 1).float()  # HWC -> CHW
    frameToFind = transforms(frameToFind).repeat(1, 1, 1, 1)  # add batch dim

    torch.set_float32_matmul_precision('medium')
    torch.backends.cudnn.deterministic = True
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Inflated 3D network for video chunks; drop the classifier head so the
    # forward pass yields a feature embedding instead of class logits.
    inflate_model = I3DResNet50(RN50Model.model).to(device)
    inflate_model.fc = torch.nn.Identity()
    inflate_model.eval()
    # 2D network for the target frame, likewise head-less.
    RN50ModelToEval.model.fc = torch.nn.Identity()
    RN50ModelToEval.to(device)
    RN50ModelToEval.eval()

    chunk_size = args.Chunk
    with torch.no_grad():
        frameToFind = frameToFind.to(device)
        Doutput = RN50ModelToEval(frameToFind).squeeze()

        maxSim = 0.0
        ListSim = []
        initFrame = None  # (start, end) frame indices of the best chunk
        videoPathLong = Path(args.Input)
        fps, size, total_frames, vr = ffmpegVideoInfo(videoPathLong) if args.BaseLib == "ffmpeg" else decordVideoInfo(videoPathLong)
        for start_idx in tqdm(range(0, total_frames, chunk_size)):
            end_idx = min(start_idx + chunk_size, total_frames)
            chunk_frames = extract_frames_ffmpeg(str(videoPathLong.absolute()), start_idx, end_idx, width=size[0], height=size[1], fps=fps) \
                if args.BaseLib == "ffmpeg" else vr.get_batch(range(start_idx, end_idx)).asnumpy()
            # RGB -> BGR to match the cv2-loaded target; .copy() gives the
            # contiguous buffer torch.from_numpy requires.
            frames = torch.from_numpy(chunk_frames[..., ::-1].copy()).to(device)
            frames = frames.permute(0, 3, 1, 2).float()  # THWC -> TCHW
            frames = transforms(frames)
            # (T, C, H, W) -> (1, C, T, H, W): batch dim, then the
            # channels-first 3D layout the inflated network expects.
            frames = frames.repeat(1, 1, 1, 1, 1).permute(0, 2, 1, 3, 4)
            outPut = inflate_model(frames).squeeze()
            # .item() detaches to a Python float so the per-chunk history
            # does not pin GPU tensors for the whole video.
            cos_sim = torch.nn.functional.cosine_similarity(outPut, Doutput, dim=0).item()
            if cos_sim > maxSim:
                maxSim = cos_sim
                initFrame = (start_idx, end_idx)
            del frames
            del chunk_frames
            gc.collect()
            ListSim.append(cos_sim)
    # Guard: initFrame stays None when every chunk scored <= 0 similarity;
    # the original crashed here with a TypeError in that case.
    if initFrame is not None:
        print(seconds_to_hms(initFrame[0]/fps), seconds_to_hms(initFrame[1]/fps))
    else:
        print("No matching segment found (all similarities <= 0).")