From d9074344e8ca6fa700eedc03df9561cbf35ec697 Mon Sep 17 00:00:00 2001
From: leonardozcm <leonardo1997zcm@gmail.com>
Date: Thu, 8 Sep 2022 10:18:27 +0800
Subject: [PATCH 1/2] Enable CPU-only training with multi-clip PyAV decoding

---
 .gitignore                        |   5 ++
 configs/Kinetics/C2D_8x8_R50.yaml |   9 ++-
 install_requirements.sh           |  10 +++
 setup.py                          |   2 +-
 slowfast/datasets/decoder.py      | 118 ++++++++++++++++++++----------
 slowfast/utils/misc.py            |   2 +
 tools/train_net.py                |   5 +-
 7 files changed, 107 insertions(+), 44 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 install_requirements.sh

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 000000000..6054951c2
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+*.pyc
+*.log
+build/*
+slowfast.egg-info/*
+*.zip
diff --git a/configs/Kinetics/C2D_8x8_R50.yaml b/configs/Kinetics/C2D_8x8_R50.yaml
index b0363d421..7668f4fb9 100644
--- a/configs/Kinetics/C2D_8x8_R50.yaml
+++ b/configs/Kinetics/C2D_8x8_R50.yaml
@@ -1,7 +1,7 @@
 TRAIN:
   ENABLE: True
   DATASET: kinetics
-  BATCH_SIZE: 64
+  BATCH_SIZE: 16
   EVAL_PERIOD: 10
   CHECKPOINT_PERIOD: 1
   AUTO_RESUME: True
@@ -12,6 +12,8 @@ DATA:
   TRAIN_CROP_SIZE: 224
   TEST_CROP_SIZE: 256
   INPUT_CHANNEL_NUM: [3]
+  PATH_TO_DATA_DIR: dataset/tiny-kinetics-400/data
+  DECODING_BACKEND: pyav
 RESNET:
   ZERO_INIT_FINAL_BN: True
   WIDTH_PER_GROUP: 64
@@ -43,13 +45,14 @@ MODEL:
   LOSS_FUNC: cross_entropy
   DROPOUT_RATE: 0.5
 TEST:
-  ENABLE: True
+  ENABLE: False
   DATASET: kinetics
   BATCH_SIZE: 64
 DATA_LOADER:
   NUM_WORKERS: 8
   PIN_MEMORY: True
-NUM_GPUS: 8
+NUM_GPUS: 0
 NUM_SHARDS: 1
+DIST_BACKEND: "gloo"
 RNG_SEED: 0
 OUTPUT_DIR: .
diff --git a/install_requirements.sh b/install_requirements.sh
new file mode 100644
index 000000000..058e28f39
--- /dev/null
+++ b/install_requirements.sh
@@ -0,0 +1,10 @@
+pip install 'git+https://github.com/facebookresearch/fvcore'
+pip install simplejson
+pip install av
+pip install -U iopath
+pip install psutil
+pip install opencv-python
+pip install tensorboard
+pip install moviepy
+pip install pytorchvideo
+pip install -e detectron2_repo
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 185c7d64f..0853b3895 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@
         "opencv-python",
         "pandas",
         "torchvision>=0.4.2",
-        "PIL",
+        "Pillow",
         "sklearn",
         "tensorboard",
         "fairscale",
diff --git a/slowfast/datasets/decoder.py b/slowfast/datasets/decoder.py
index 7b7aff4ae..0c9f1f920 100644
--- a/slowfast/datasets/decoder.py
+++ b/slowfast/datasets/decoder.py
@@ -3,8 +3,9 @@
 
 import logging
 import math
-import numpy as np
 import random
+
+import numpy as np
 import torch
 import torchvision.io as io
 
@@ -84,7 +85,7 @@ def get_multiple_start_end_idx(
     num_clips_uniform,
     min_delta=0,
     max_delta=math.inf,
-    use_offset=False,
+    use_offset=False
 ):
     """
     Sample a clip of size clip_size from a video of size video_size and
@@ -114,7 +115,7 @@ def sample_clips(
         min_delta=0,
         max_delta=math.inf,
         num_retries=100,
-        use_offset=False,
+        use_offset=False
     ):
         se_inds = np.empty((0, 2))
         dt = np.empty((0))
@@ -125,15 +126,13 @@ def sample_clips(
                 if clip_idx == -1:
                     # Random temporal sampling.
                     start_idx = random.uniform(0, max_start)
-                else:  # Uniformly sample the clip with the given index.
+                else: # Uniformly sample the clip with the given index.
                     if use_offset:
                         if num_clips_uniform == 1:
                             # Take the center clip if num_clips is 1.
                             start_idx = math.floor(max_start / 2)
                         else:
-                            start_idx = clip_idx * math.floor(
-                                max_start / (num_clips_uniform - 1)
-                            )
+                            start_idx = clip_idx * math.floor(max_start / (num_clips_uniform - 1))
                     else:
                         start_idx = max_start * clip_idx / num_clips_uniform
 
@@ -304,7 +303,10 @@ def torchvision_decode(
         decode_all_video = False  # try selective decoding
 
         clip_sizes = [
-            np.maximum(1.0, sampling_rate[i] * num_frames[i] / target_fps * fps)
+            np.maximum(
+                1.0,
+                sampling_rate[i] * num_frames[i] / target_fps * fps
+            )
             for i in range(len(sampling_rate))
         ]
         start_end_delta_time = get_multiple_start_end_idx(
@@ -381,6 +383,10 @@ def pyav_decode(
     num_clips_uniform=10,
     target_fps=30,
     use_offset=False,
+    modalities=("visual",),
+    max_spatial_scale=0,
+    min_delta=-math.inf,
+    max_delta=math.inf,
 ):
     """
     Convert the video from its original fps to the target_fps. If the video
@@ -388,7 +394,6 @@ def pyav_decode(
     the perform temporal selective decoding and sample a clip from the video
     with the PyAV decoder. If the video does not support selective decoding,
     decode the entire video.
-
     Args:
         container (container): pyav container.
         sampling_rate (int): frame sampling rate (interval between two sampled
@@ -418,38 +423,69 @@ def pyav_decode(
         # If failed to fetch the decoding information, decode the entire video.
         decode_all_video = True
         video_start_pts, video_end_pts = 0, math.inf
+        start_end_delta_time = None
+
+        frames = None
+        if container.streams.video:
+            video_frames, max_pts = pyav_decode_stream(
+                container,
+                video_start_pts,
+                video_end_pts,
+                container.streams.video[0],
+                {"video": 0},
+            )
+            container.close()
+
+            frames = [frame.to_rgb().to_ndarray() for frame in video_frames]
+            frames = torch.as_tensor(np.stack(frames))
+        frames_out = [frames]
+
     else:
         # Perform selective decoding.
         decode_all_video = False
-        clip_size = np.maximum(
-            1.0, np.ceil(sampling_rate * (num_frames - 1) / target_fps * fps)
-        )
-        start_idx, end_idx, fraction = get_start_end_idx(
+        clip_sizes = [
+            np.maximum(
+                1.0,
+                np.ceil(
+                    sampling_rate[i] * (num_frames[i] - 1) / target_fps * fps
+                ),
+            )
+            for i in range(len(sampling_rate))
+        ]
+        start_end_delta_time = get_multiple_start_end_idx(
             frames_length,
-            clip_size,
+            clip_sizes,
             clip_idx,
             num_clips_uniform,
-            use_offset=use_offset,
-        )
-        timebase = duration / frames_length
-        video_start_pts = int(start_idx * timebase)
-        video_end_pts = int(end_idx * timebase)
-
-    frames = None
-    # If video stream was found, fetch video frames from the video.
-    if container.streams.video:
-        video_frames, max_pts = pyav_decode_stream(
-            container,
-            video_start_pts,
-            video_end_pts,
-            container.streams.video[0],
-            {"video": 0},
+            min_delta=min_delta,
+            max_delta=max_delta,
         )
+        frames_out = [None] * len(num_frames)
+        for k in range(len(num_frames)):
+            start_idx = start_end_delta_time[k, 0]
+            end_idx = start_end_delta_time[k, 1]
+            timebase = duration / frames_length
+            video_start_pts = int(start_idx * timebase)
+            video_end_pts = int(end_idx * timebase)
+
+            frames = None
+            # If video stream was found, fetch video frames from the video.
+            if container.streams.video:
+                video_frames, max_pts = pyav_decode_stream(
+                    container,
+                    video_start_pts,
+                    video_end_pts,
+                    container.streams.video[0],
+                    {"video": 0},
+                )
+
+                frames = [frame.to_rgb().to_ndarray() for frame in video_frames]
+                frames = torch.as_tensor(np.stack(frames))
+
+            frames_out[k] = frames
         container.close()
 
-        frames = [frame.to_rgb().to_ndarray() for frame in video_frames]
-        frames = torch.as_tensor(np.stack(frames))
-    return frames, fps, decode_all_video
+    return frames_out, fps, decode_all_video, start_end_delta_time
 
 
 def decode(
@@ -509,10 +545,9 @@ def decode(
         )  # clips come temporally ordered from decoder
     try:
         if backend == "pyav":
-            assert (
-                min_delta == -math.inf and max_delta == math.inf
-            ), "delta sampling not supported in pyav"
-            frames_decoded, fps, decode_all_video = pyav_decode(
+            assert min_delta == -math.inf and max_delta == math.inf, \
+                "delta sampling not supported in pyav"
+            frames_decoded, fps, decode_all_video, start_end_delta_time = pyav_decode(
                 container,
                 sampling_rate,
                 num_frames,
@@ -520,6 +555,10 @@ def decode(
                 num_clips_uniform,
                 target_fps,
                 use_offset=use_offset,
+                modalities=("visual",),
+                max_spatial_scale=max_spatial_scale,
+                min_delta=min_delta,
+                max_delta=max_delta,
             )
         elif backend == "torchvision":
             (
@@ -557,7 +596,10 @@ def decode(
         frames_decoded = [frames_decoded]
     num_decoded = len(frames_decoded)
     clip_sizes = [
-        np.maximum(1.0, sampling_rate[i] * num_frames[i] / target_fps * fps)
+        np.maximum(
+            1.0,
+            sampling_rate[i] * num_frames[i] / target_fps * fps
+        )
         for i in range(len(sampling_rate))
     ]
 
@@ -621,4 +663,4 @@ def decode(
             for i in range(num_decode)
         )
 
-    return frames_out, start_end_delta_time, time_diff_aug
+    return frames_out, start_end_delta_time, time_diff_aug
\ No newline at end of file
diff --git a/slowfast/utils/misc.py b/slowfast/utils/misc.py
index 48e1ffac8..e951bc717 100644
--- a/slowfast/utils/misc.py
+++ b/slowfast/utils/misc.py
@@ -412,6 +412,7 @@ def launch_job(cfg, init_method, func, daemon=False):
             daemonic processes will be created
     """
     if cfg.NUM_GPUS > 1:
+        print("GPU Number is {}".format(cfg.NUM_GPUS))
         torch.multiprocessing.spawn(
             mpu.run,
             nprocs=cfg.NUM_GPUS,
@@ -427,6 +428,7 @@ def launch_job(cfg, init_method, func, daemon=False):
             daemon=daemon,
         )
     else:
+        print("GPU Number is 0")
         func(cfg=cfg)
 
 
diff --git a/tools/train_net.py b/tools/train_net.py
index 49835f8d3..369a33bb5 100644
--- a/tools/train_net.py
+++ b/tools/train_net.py
@@ -136,6 +136,7 @@ def train_epoch(
                 preds, labels = model(inputs)
             else:
                 preds = model(inputs)
+            print("cur iteration ", cur_iter)
             if cfg.TASK == "ssl" and cfg.MODEL.MODEL_NAME == "ContrastiveModel":
                 labels = torch.zeros(
                     preds.size(0), dtype=labels.dtype, device=labels.device
@@ -271,12 +272,12 @@ def train_epoch(
                 )
         train_meter.iter_toc()  # do measure allreduce for this meter
         train_meter.log_iter_stats(cur_epoch, cur_iter)
-        torch.cuda.synchronize()
+        # torch.cuda.synchronize()
         train_meter.iter_tic()
     del inputs
 
     # in case of fragmented memory
-    torch.cuda.empty_cache()
+    # torch.cuda.empty_cache()
 
     # Log epoch stats.
     train_meter.log_epoch_stats(cur_epoch)

From da5de7221f8ed23025e659895956fcf7092e3a20 Mon Sep 17 00:00:00 2001
From: leonardozcm <leonardo1997zcm@gmail.com>
Date: Fri, 16 Sep 2022 14:47:47 +0800
Subject: [PATCH 2/2] Enable ray/spark backends in Orca example; remove debug prints

---
 orcaexample/kinetics.py | 32 ++++++++++++++++----------------
 slowfast/utils/misc.py  |  2 --
 tools/train_net.py      |  1 -
 3 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/orcaexample/kinetics.py b/orcaexample/kinetics.py
index 449867e6d..f747b451c 100644
--- a/orcaexample/kinetics.py
+++ b/orcaexample/kinetics.py
@@ -95,22 +95,22 @@ def loss_creator(config):
                        )
     val_stats = orca_estimator.evaluate(data=validation_data_creator(cfg,cfg.TEST.BATCH_SIZE))
     print("===> Validation Complete: Top1Accuracy {}".format(val_stats["Accuracy"]))
-# elif args.backend in ["ray", "spark"]:
-#     orca_estimator = Estimator.from_torch(model=model_creator,
-#                                           optimizer=optim_creator,
-#                                           loss=loss_creator,
-#                                           metrics=[Accuracy()],
-#                                           backend=args.backend,
-#                                           config=cfg,
-#                                           model_dir=os.getcwd(),
-#                                           use_tqdm=True)
-#     orca_estimator.fit(data=train_loader_creator,
-#                        validation_data=validation_data_creator,
-#                        batch_size=cfg.TRAIN.BATCH_SIZE,
-#                        epochs=cfg.SOLVER.MAX_EPOCH)
-#     val_stats = orca_estimator.evaluate(data=validation_data_creator, batch_size=cfg.TEST.BATCH_SIZE)
-#     print("===> Validation Complete: Top1Accuracy {}".format(val_stats["Accuracy"]))
-#     orca_estimator.shutdown()
+elif args.backend in ["ray", "spark"]:
+    orca_estimator = Estimator.from_torch(model=model_creator,
+                                          optimizer=optim_creator,
+                                          loss=loss_creator,
+                                          metrics=[Accuracy()],
+                                          backend=args.backend,
+                                          config=cfg,
+                                          model_dir=os.getcwd(),
+                                          use_tqdm=True)
+    orca_estimator.fit(data=train_loader_creator,
+                       validation_data=validation_data_creator,
+                       batch_size=cfg.TRAIN.BATCH_SIZE,
+                       epochs=cfg.SOLVER.MAX_EPOCH)
+    val_stats = orca_estimator.evaluate(data=validation_data_creator, batch_size=cfg.TEST.BATCH_SIZE)
+    print("===> Validation Complete: Top1Accuracy {}".format(val_stats["Accuracy"]))
+    orca_estimator.shutdown()
 else:
     invalidInputError(False, "Only bigdl, ray, and spark are supported "
                         "as the backend, but got {}".format(args.backend))
diff --git a/slowfast/utils/misc.py b/slowfast/utils/misc.py
index e951bc717..48e1ffac8 100644
--- a/slowfast/utils/misc.py
+++ b/slowfast/utils/misc.py
@@ -412,7 +412,6 @@ def launch_job(cfg, init_method, func, daemon=False):
             daemonic processes will be created
     """
     if cfg.NUM_GPUS > 1:
-        print("GPU Number is {}".format(cfg.NUM_GPUS))
         torch.multiprocessing.spawn(
             mpu.run,
             nprocs=cfg.NUM_GPUS,
@@ -428,7 +427,6 @@ def launch_job(cfg, init_method, func, daemon=False):
             daemon=daemon,
         )
     else:
-        print("GPU Number is 0")
         func(cfg=cfg)
 
 
diff --git a/tools/train_net.py b/tools/train_net.py
index 034cacfc8..b8b05b61f 100644
--- a/tools/train_net.py
+++ b/tools/train_net.py
@@ -136,7 +136,6 @@ def train_epoch(
                 preds, labels = model(inputs)
             else:
                 preds = model(inputs)
-            print("cur iteration ", cur_iter)
             if cfg.TASK == "ssl" and cfg.MODEL.MODEL_NAME == "ContrastiveModel":
                 labels = torch.zeros(
                     preds.size(0), dtype=labels.dtype, device=labels.device