{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import decord\n",
"from pathlib import Path\n",
"import torch\n",
"import cv2\n",
"import numpy as np\n",
"from inflateDEMO import I3DResNet50\n",
"import torchvision.transforms as T\n",
"import sys\n",
"from tqdm.notebook import tqdm\n",
"\n",
"# Make the local 'Clasification' package importable; idempotent across re-runs.\n",
"# NOTE(review): hardcoded absolute Windows path -- consider a config constant.\n",
"_CLASIF_DIR = \"d:\\\\Users\\\\user\\\\Documents\\\\postata\\\\RARP\\\\Clasification\"\n",
"try:\n",
"    print(sys.path.index(_CLASIF_DIR))\n",
"except ValueError:  # .index() raises ValueError when absent; never a bare except\n",
"    sys.path.append(_CLASIF_DIR)\n",
"\n",
"print(sys.path)\n",
"from Models import RARP_NVB_ResNet50\n",
"\n",
"\n",
"def _removeBlackBorder(image):\n",
"    \"\"\"Crop away the black border around the endoscopic view.\n",
"\n",
"    Builds a mask of pixels whose HSV hue lies OUTSIDE the (25, 175) band,\n",
"    takes the largest external contour of that region after a 15x15 opening,\n",
"    and returns the bounding-box crop of the ORIGINAL image.\n",
"\n",
"    NOTE(review): assumes a BGR uint8 frame (OpenCV order) and that at least\n",
"    one contour survives -- max() raises ValueError on an all-masked frame.\n",
"    \"\"\"\n",
"    image = np.array(image)\n",
"\n",
"    # Hue mask: 255 where the hue is outside (25, 175), 0 inside.\n",
"    hsv = cv2.cvtColor(image.copy(), cv2.COLOR_BGR2HSV)\n",
"    hue = hsv[:, :, 0]\n",
"    lo, hi = 25, 175\n",
"    keep = np.full(hue.shape, 255, dtype=np.uint8)\n",
"    keep[(hue > lo) & (hue < hi)] = 0\n",
"\n",
"    # Round-trip HSV->BGR (8-bit conversion is slightly lossy; kept on purpose\n",
"    # to match the original pipeline), then black out the masked region.\n",
"    bgr_again = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)\n",
"    roi = cv2.bitwise_and(bgr_again, bgr_again, mask=keep)\n",
"\n",
"    # Binarize, open to drop speckle, then take the largest external contour.\n",
"    gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)\n",
"    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY)\n",
"    rect15 = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 15))\n",
"    opened = cv2.morphologyEx(binary, cv2.MORPH_OPEN, rect15)\n",
"\n",
"    found = cv2.findContours(opened, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)\n",
"    contours = found[0] if len(found) == 2 else found[1]  # OpenCV 3 vs 4 tuple shape\n",
"    largest = max(contours, key=cv2.contourArea)\n",
"    x, y, w, h = cv2.boundingRect(largest)\n",
"    return image[y : y + h, x : x + w]\n",
"\n",
"def VideoPrePros_BackBars(VideoIN_Path:str, VideoOUT_Path:str, demoVideo:int = None):\n",
" size = (1269, 1007)\n",
" fourcc = cv2.VideoWriter_fourcc(*'mp4v')\n",
" video = cv2.VideoWriter(VideoOUT_Path, fourcc, 30, size)\n",
" \n",
" if demoVideo is not None:\n",
" demoVideoOut = cv2.VideoWriter(\"demo.mp4\", fourcc, demoVideo, size)\n",
"\n",
" cap = cv2.VideoCapture(VideoIN_Path)\n",
" while (True):\n",
" ret, frame = cap.read()\n",
" if not ret:\n",
" break\n",
" \n",
" img = _removeBlackBorder(frame)\n",
" img = cv2.resize(img, size, interpolation=cv2.INTER_CUBIC)\n",
" video.write(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))\n",
" #video.write(img)\n",
" if demoVideo is not None:\n",
" demoVideoOut.write(img)\n",
" \n",
" video.release()\n",
" if demoVideo is not None:\n",
" demoVideoOut.release()\n",
" cap.release()\n",
" \n",
"RN50Model = RARP_NVB_ResNet50.load_from_checkpoint(\"../log_ResNet50_X10/lightning_logs/version_8/checkpoints/RARP-epoch=5.ckpt\")\n",
"RN50ModelToEval = RARP_NVB_ResNet50.load_from_checkpoint(\"../log_ResNet50_X10/lightning_logs/version_8/checkpoints/RARP-epoch=5.ckpt\")"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import ffmpeg\n",
"\n",
"def extract_frames_ffmpeg(video_path, start_frame, end_frame, width=None, height=None, fps=30):\n",
"    \"\"\"Decode frames [start_frame, end_frame) of a video into a NumPy array.\n",
"\n",
"    Uses ffmpeg's input seek (-ss) plus a duration (-t); both are expressed\n",
"    in seconds, hence the divisions by fps. Frames arrive as raw rgb24 bytes\n",
"    on stdout.\n",
"\n",
"    Returns a (num_frames, height, width, 3) uint8 RGB array when width and\n",
"    height are given, otherwise the flat byte buffer.\n",
"    \"\"\"\n",
"    out, _ = (\n",
"        ffmpeg\n",
"        .input(video_path, ss=start_frame / fps,\n",
"               # BUG FIX: -t is a DURATION, not an end timestamp. The old code\n",
"               # passed end_frame / fps, decoding too many frames whenever\n",
"               # start_frame > 0.\n",
"               t=(end_frame - start_frame) / fps,\n",
"               hwaccel='cuda')\n",
"        .output('pipe:', format='rawvideo', pix_fmt='rgb24')\n",
"        .run(capture_stdout=True)  # returns (stdout_bytes, stderr_bytes)\n",
"    )\n",
"    \n",
"    video = np.frombuffer(out, np.uint8)\n",
"    if width and height:\n",
"        video = video.reshape((-1, height, width, 3))  # frames, H, W, channels\n",
"    return video"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"videoPathLong = Path(\"./dataset/NVB/350Full.mp4\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"decord.bridge.set_bridge('native')\n",
"vr = decord.VideoReader(str(videoPathLong.absolute()))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"from nested_lookup import nested_lookup\n",
"from fractions import Fraction\n",
"\n",
"# Probe container metadata (fps, dimensions, frame count) via ffprobe.\n",
"viodeoInfo = ffmpeg.probe(str(videoPathLong.absolute()))\n",
"# Parse \"avg_frame_rate\" strings like \"30000/1001\" WITHOUT eval():\n",
"# eval on external metadata is an injection risk; Fraction is exact and safe.\n",
"fps = float(Fraction(nested_lookup(\"avg_frame_rate\", viodeoInfo)[0]))\n",
"w = int(nested_lookup(\"width\", viodeoInfo)[0])\n",
"h = int(nested_lookup(\"height\", viodeoInfo)[0])\n",
"total_frames = int(nested_lookup(\"nb_frames\", viodeoInfo)[0])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"total_frames"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"def show_picture(im):\n",
"    \"\"\"Render `im` with matplotlib, hiding both axes' tick marks.\"\"\"\n",
"    plt.imshow(im)\n",
"    for set_ticks in (plt.xticks, plt.yticks):\n",
"        set_ticks([])\n",
"    plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Basic stats of the video plus how many whole chunks of each candidate\n",
"# window length fit into it.\n",
"fps = vr.get_avg_fps()\n",
"print(f\"FPS:{fps}\")\n",
"segs = len(vr)/fps\n",
"print(f\"Video Length: {segs} seg.\")\n",
"print(f\"Video Length: {round(segs*fps)} frames.\")\n",
"# Typo fix: previously printed \"Chuks\"; one loop replaces four pasted prints.\n",
"for window in (15, 30, 35, 40):\n",
"    print(f\"Chunks of {window} seg: {segs//window} chunks\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"start_frame = 0 # Fotograma de inicio\n",
"num_frames = 450 # Número de fotogramas que quieres extraer\n",
" # Ancho y alto del video (ajusta según el video)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"frames_batch = extract_frames_ffmpeg(str(videoPathLong.absolute()), start_frame, num_frames, width=w, height=h, fps=fps)\n",
"\n",
"print(f'Frames extraídos: {frames_batch.shape}')\n",
"\n",
"bgrFramesBatch = frames_batch[..., ::-1].copy()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"chunk_frames = vr.get_batch(range(start_frame, num_frames)).asnumpy()\n",
" \n",
"chunk_frames_bgr = chunk_frames[..., ::-1].copy()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"show_picture(chunk_frames_bgr[0])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"mean, std = ([30.38144216, 42.03988769, 97.8896116], [40.63141752, 44.26910074, 50.29294373])\n",
"transforms = T.Compose([\n",
" T.Resize((256,256), antialias=True, interpolation=T.InterpolationMode.BICUBIC),\n",
" T.CenterCrop(224),\n",
" T.Normalize(mean, std)\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"frameToFind = cv2.imread(str(Path(\"../DataSet_Ando_All/NVB/350.tiff\")), cv2.IMREAD_COLOR)\n",
"#frameToFind = _removeBlackBorder(frameToFind)\n",
"frameToFind = torch.Tensor(frameToFind)\n",
"frameToFind = frameToFind.permute(2, 0, 1).float()\n",
"\n",
"frameToFind = transforms(frameToFind)\n",
"\n",
"frameToFind = frameToFind.repeat(1, 1, 1, 1)\n",
"\n",
"torch.set_float32_matmul_precision('medium')\n",
"torch.backends.cudnn.deterministic = True\n",
"\n",
"device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n",
"\n",
"InfalteModel = I3DResNet50(RN50Model.model).to(device)\n",
"InfalteModel.fc = torch.nn.Identity()\n",
"InfalteModel.eval()\n",
"\n",
"RN50ModelToEval.model.fc = torch.nn.Identity()\n",
"RN50ModelToEval.to(device)\n",
"RN50ModelToEval.eval()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"framesChunk = np.array([15, 30, 35, 40]) * round(fps)\n",
"\n",
"chunk_size = framesChunk[0]\n",
"total_frames = len(vr)\n",
"\n",
"with torch.no_grad():\n",
" frameToFind = frameToFind.to(device)\n",
" \n",
" Doutput = RN50ModelToEval(frameToFind)\n",
" Doutput = Doutput.squeeze()\n",
" \n",
" maxSim = 0\n",
" \n",
" ListSim = []\n",
" \n",
" initFrame = None\n",
" \n",
" for start_idx in tqdm(range(0, total_frames, chunk_size)):\n",
" end_idx = min(start_idx + chunk_size, total_frames)\n",
" chunk_frames = vr.get_batch(range(start_idx, end_idx)).asnumpy()\n",
" \n",
" chunk_frames_bgr = chunk_frames[..., ::-1].copy()\n",
" \n",
" frames = torch.from_numpy(chunk_frames_bgr).to(device)\n",
" frames = frames.permute(0, 3, 1, 2)\n",
" frames = frames.float()\n",
"\n",
" frames = transforms(frames)\n",
"\n",
" frames = frames.repeat(1, 1, 1, 1, 1)\n",
" frames = frames.permute(0, 2, 1, 3, 4)\n",
" \n",
" outPut = InfalteModel(frames)\n",
" outPut = outPut.squeeze()\n",
" \n",
" #print(outPut.shape, Doutput.shape)\n",
" \n",
" cos_sim = torch.nn.functional.cosine_similarity(outPut, Doutput, dim=0)\n",
" \n",
" #maxSim = cos_sim if cos_sim > maxSim else maxSim\n",
" if cos_sim > maxSim:\n",
" print(cos_sim)\n",
" maxSim = cos_sim\n",
" initFrame = (start_idx, end_idx)\n",
" \n",
"\n",
" ListSim.append(cos_sim)\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def seconds_to_hms(seconds):\n",
" hours = seconds // 3600\n",
" minutes = (seconds % 3600) // 60\n",
" secs = seconds % 60\n",
" return f'{int(hours)}:{int(minutes):02}:{int(secs):02}'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(seconds_to_hms(initFrame[0]/fps), seconds_to_hms(initFrame[1]/fps))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"import ffmpeg\n",
"from pathlib import Path\n",
"\n",
"videoPathLong = Path(\"./dataset/NVB/350Full.mp4\")\n",
"fileVideo = ffmpeg.input(str(videoPathLong.absolute()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"outPutVideo = ffmpeg.output(fileVideo.trim(start_frame=initFrame[0], end_frame=initFrame[1]), \"350_1.mp4\", vcodec='h264_nvenc')\n",
"ffmpeg.run(outPutVideo)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"from scipy.signal import savgol_filter\n",
"\n",
"# Plot per-chunk cosine similarity over the whole video and mark the peak.\n",
"Name = \"NVB/350.mp4\"\n",
"chunks = range(0, total_frames, chunk_size)  # start frame of each chunk\n",
"sim = torch.tensor(ListSim)\n",
"\n",
"smoothed_accuracy = savgol_filter(sim, window_length=5, polyorder=2)\n",
"\n",
"maxSim = sim.max()\n",
"indexmax = int(sim.argmax())  # plain int so range indexing is unambiguous\n",
"bestSim = chunks[indexmax]\n",
"\n",
"plt.figure(figsize=(15, 6))\n",
"plt.plot(chunks, sim, marker='o', linestyle='-', color='b', label='Cosine similarity')\n",
"plt.plot(chunks, smoothed_accuracy, color='g', linestyle='--', label='Smoothed Data')\n",
"plt.title(f\"frames vs. Sim. [{Name}]\")\n",
"plt.xlabel('frames')\n",
"plt.ylabel('Cos Sim.')\n",
"plt.grid(True)\n",
"\n",
"# Typo fix: legend label previously read \"Higth Sim\".\n",
"plt.scatter(bestSim, maxSim, zorder=5, marker=\"x\", color='r', label=f'Highest Sim: {maxSim:.4f} at {bestSim} start frame')\n",
"plt.legend()\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"from scipy.signal import savgol_filter\n",
"\n",
"# Data from the table\n",
"Name = \"NVB/349.mp4\"\n",
"frames = [19,24,31,47,52,58,59,66,72,73,76,77,93,116,154,231,461]\n",
"accuracy = [0.7166,0.7372,0.7518,0.7503,0.6988,0.713,0.7233,0.6932,0.6776,0.6842,0.6858,0.689,0.6887,0.6643,0.6811,0.6605,0.6827]\n",
"\n",
"smoothed_accuracy = savgol_filter(accuracy, window_length=5, polyorder=2)\n",
"\n",
"\n",
"# Finding the best accuracy and corresponding number of frames\n",
"max_acc = max(accuracy)\n",
"max_acc_index = accuracy.index(max_acc)\n",
"best_frames = frames[max_acc_index]\n",
"\n",
"# Plotting the data\n",
"plt.figure(figsize=(8, 6))\n",
"plt.plot(frames, accuracy, marker='o', linestyle='-', color='b', label='Original Data')\n",
"plt.plot(frames, smoothed_accuracy, color='g', linestyle='--', label='Smoothed Data')\n",
"plt.title(f\"No. Frames vs. Prob. NVB [{Name}]\")\n",
"plt.xlabel('No. Frames')\n",
"plt.ylabel('Prob. NVB')\n",
"plt.grid(True)\n",
"\n",
"# Highlight the best accuracy point\n",
"plt.scatter(best_frames, max_acc, zorder=5, marker=\"x\", color='r', label=f'Best Acc: {max_acc} at {best_frames} Frames')\n",
"plt.legend()\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"from scipy.signal import savgol_filter\n",
"\n",
"# Data from the table\n",
"Name = \"NOT_NVB/201.mp4\"\n",
"frames = [18,23,30,45,50,57,59,65,72,73,75,76,90,113,150,224,450]\n",
"accuracy = [0.4945,0.5124,0.5194,0.5173,0.501,0.5206,0.5222,0.5322,0.496,0.5042,0.5007,0.4977,0.5042,0.5139,0.51,0.5116,0.5123]\n",
"\n",
"smoothed_accuracy = savgol_filter(accuracy, window_length=5, polyorder=2)\n",
"\n",
"\n",
"# Finding the best accuracy and corresponding number of frames\n",
"max_acc = min(accuracy)\n",
"max_acc_index = accuracy.index(max_acc)\n",
"best_frames = frames[max_acc_index]\n",
"\n",
"# Plotting the data\n",
"plt.figure(figsize=(8, 6))\n",
"plt.plot(frames, accuracy, marker='o', linestyle='-', color='r', label='Original Data')\n",
"plt.plot(frames, smoothed_accuracy, color='g', linestyle='--', label='Smoothed Data')\n",
"plt.title(f\"No. Frames vs. Prob. NVB [{Name}]\")\n",
"plt.xlabel('No. Frames')\n",
"plt.ylabel('Prob. NVB')\n",
"plt.grid(True)\n",
"\n",
"# Highlight the best accuracy point\n",
"plt.scatter(best_frames, max_acc, zorder=5, marker=\"x\", color='b', label=f'Best Acc: {max_acc} at {best_frames} Frames')\n",
"plt.legend()\n",
"\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "pyRARP",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}