diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000000000000000000000000000000000000..c286ad0c15dff1a3fcd779d0b0e220b6b7455185 --- /dev/null +++ b/.flake8 @@ -0,0 +1,8 @@ +# This is an example .flake8 config, used when developing *Black* itself. +# Keep in sync with setup.cfg which is used for source packages. + +[flake8] +ignore = E203, E266, E501, W503 +max-line-length = 80 +max-complexity = 18 +select = B,C,E,F,W,T4,B9 diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md new file mode 100644 index 0000000000000000000000000000000000000000..101235f7a10ce7b5919eadbdf2c5a975a7047b39 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug-report.md @@ -0,0 +1,49 @@ +--- +name: "\U0001F41B Bug Report" +about: Submit a bug report to help us improve Mask R-CNN Benchmark + +--- + +## 🐛 Bug + + + +## To Reproduce + +Steps to reproduce the behavior: + +1. +1. +1. + + + +## Expected behavior + + + +## Environment + +Please copy and paste the output from the +[environment collection script from PyTorch](https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py) +(or fill out the checklist below manually). + +You can get the script and run it with: +``` +wget https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py +# For security purposes, please check the contents of collect_env.py before running it. +python collect_env.py +``` + + - PyTorch Version (e.g., 1.0): + - OS (e.g., Linux): + - How you installed PyTorch (`conda`, `pip`, source): + - Build command you used (if compiling from source): + - Python version: + - CUDA/cuDNN version: + - GPU models and configuration: + - Any other relevant information: + +## Additional context + + diff --git a/.github/ISSUE_TEMPLATE/feature-request.md b/.github/ISSUE_TEMPLATE/feature-request.md new file mode 100644 index 0000000000000000000000000000000000000000..6c87456464034ece2b99bc0499006ba2730ab8c6 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature-request.md @@ -0,0 +1,24 @@ +--- +name: "\U0001F680Feature Request" +about: Submit a proposal/request for a new Mask R-CNN Benchmark feature + +--- + +## 🚀 Feature + + +## Motivation + + + +## Pitch + + + +## Alternatives + + + +## Additional context + + diff --git a/.github/ISSUE_TEMPLATE/questions-help-support.md b/.github/ISSUE_TEMPLATE/questions-help-support.md new file mode 100644 index 0000000000000000000000000000000000000000..992f1b5f61035c196ace628decc77473a7f93f4c --- /dev/null +++ b/.github/ISSUE_TEMPLATE/questions-help-support.md @@ -0,0 +1,7 @@ +--- +name: "❓Questions/Help/Support" +about: Do you need support? 
+ +--- + +## ❓ Questions and Help diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..2a222171139927240b103c86c31582160b3051ab --- /dev/null +++ b/.gitignore @@ -0,0 +1,33 @@ +# compilation and distribution +__pycache__ +_ext +*.pyc +*.so +maskrcnn_benchmark.egg-info/ +build/ +dist/ + +# pytorch/python/numpy formats +*.pth +*.pkl +*.npy + +# ipython/jupyter notebooks +*.ipynb +**/.ipynb_checkpoints/ + +# Editor temporaries +*.swn +*.swo +*.swp +*~ + +# Pycharm editor settings +.idea + +# project dirs +/datasets +/models + +.DS_Store +.pdf \ No newline at end of file diff --git a/ABSTRACTIONS.md b/ABSTRACTIONS.md new file mode 100644 index 0000000000000000000000000000000000000000..cdb3c428722418cdc2bb9af5018cefa454338e7d --- /dev/null +++ b/ABSTRACTIONS.md @@ -0,0 +1,65 @@ +## Abstractions +The main abstractions introduced by `maskrcnn_benchmark` that are useful to +have in mind are the following: + +### ImageList +In PyTorch, the first dimension of the input to the network generally represents +the batch dimension, and thus all elements of the same batch have the same +height / width. +In order to support images with different sizes and aspect ratios in the same +batch, we created the `ImageList` class, which holds internally a batch of +images (of possibly different sizes). The images are padded with zeros such that +they have the same final size and batched over the first dimension. The original +sizes of the images before padding are stored in the `image_sizes` attribute, +and the batched tensor in `tensors`. +We provide a convenience function `to_image_list` that accepts a few different +input types, including a list of tensors, and returns an `ImageList` object. + +```python +import torch + +from maskrcnn_benchmark.structures.image_list import to_image_list + +images = [torch.rand(3, 100, 200), torch.rand(3, 150, 170)] +batched_images = to_image_list(images) + +# it is also possible to make the final batched image be a multiple of a number +batched_images_32 = to_image_list(images, size_divisible=32) +``` + +### BoxList +The `BoxList` class holds a set of bounding boxes (represented as an `Nx4` tensor) for +a specific image, as well as the size of the image as a `(width, height)` tuple. +It also contains a set of methods that allow performing geometric +transformations on the bounding boxes (such as cropping, scaling and flipping). +The class accepts bounding boxes in two different input formats: +- `xyxy`, where each box is encoded by its `x1`, `y1`, `x2` and `y2` coordinates, and +- `xywh`, where each box is encoded as `x1`, `y1`, `w` and `h`. + +Additionally, each `BoxList` instance can also hold arbitrary additional information +for each bounding box, such as labels, visibility, probability scores, etc.
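+
+As a quick illustration of the field API (a minimal sketch; it only assumes the
+`add_field` / `has_field` / `get_field` / `fields` accessors that `demo/predictor.py`
+in this repository already relies on):
+
+```python
+import torch
+
+from maskrcnn_benchmark.structures.bounding_box import BoxList
+
+bbox = BoxList(torch.tensor([[0., 0., 10., 10.], [5., 5., 20., 20.]]),
+               image_size=(100, 200), mode='xyxy')
+bbox.add_field('scores', torch.tensor([0.9, 0.3]))  # attach per-box data
+
+if bbox.has_field('scores'):            # check before reading a field
+    scores = bbox.get_field('scores')   # tensor with one score per box
+print(bbox.fields())                    # ['scores']
+```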
+ +Here is an example on how to create a `BoxList` from a list of coordinates: +```python +from maskrcnn_benchmark.structures.bounding_box import BoxList, FLIP_LEFT_RIGHT + +width = 100 +height = 200 +boxes = [ + [0, 10, 50, 50], + [50, 20, 90, 60], + [10, 10, 50, 50] +] +# create a BoxList with 3 boxes +bbox = BoxList(boxes, image_size=(width, height), mode='xyxy') + +# perform some box transformations, has similar API as PIL.Image +bbox_scaled = bbox.resize((width * 2, height * 3)) +bbox_flipped = bbox.transpose(FLIP_LEFT_RIGHT) + +# add labels for each bbox +labels = torch.tensor([0, 10, 1]) +bbox.add_field('labels', labels) + +# bbox also support a few operations, like indexing +# here, selects boxes 0 and 2 +bbox_subset = bbox[[0, 2]] +``` diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000000000000000000000000000000000..0f7ad8bfc173eac554f0b6ef7c684861e8014bbe --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,5 @@ +# Code of Conduct + +Facebook has adopted a Code of Conduct that we expect project participants to adhere to. +Please read the [full text](https://code.fb.com/codeofconduct/) +so that you can understand what actions will and will not be tolerated. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000000000000000000000000000000000000..fc14cd3c73f6952be905f17e15a9a909a4561bb8 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,39 @@ +# Contributing to Mask-RCNN Benchmark +We want to make contributing to this project as easy and transparent as +possible. + +## Our Development Process +Minor changes and improvements will be released on an ongoing basis. Larger changes (e.g., changesets implementing a new paper) will be released on a more periodic basis. + +## Pull Requests +We actively welcome your pull requests. + +1. Fork the repo and create your branch from `master`. +2. If you've added code that should be tested, add tests. +3. If you've changed APIs, update the documentation. +4. Ensure the test suite passes. +5. Make sure your code lints. +6. If you haven't already, complete the Contributor License Agreement ("CLA"). + +## Contributor License Agreement ("CLA") +In order to accept your pull request, we need you to submit a CLA. You only need +to do this once to work on any of Facebook's open source projects. + +Complete your CLA here: + +## Issues +We use GitHub issues to track public bugs. Please ensure your description is +clear and has sufficient instructions to be able to reproduce the issue. + +Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe +disclosure of security bugs. In those cases, please go through the process +outlined on that page and do not file a public issue. + +## Coding Style +* 4 spaces for indentation rather than tabs +* 80 character line length +* PEP8 formatting following [Black](https://black.readthedocs.io/en/stable/) + +## License +By contributing to Mask-RCNN Benchmark, you agree that your contributions will be licensed +under the LICENSE file in the root directory of this source tree. diff --git a/INSTALL.md b/INSTALL.md new file mode 100644 index 0000000000000000000000000000000000000000..caa1f3b78b2d2b7bc6321625c96efaa7a109dc27 --- /dev/null +++ b/INSTALL.md @@ -0,0 +1,73 @@ +## Installation + +### Requirements: +- PyTorch 1.0 from a nightly release. 
Installation instructions can be found in https://pytorch.org/get-started/locally/ +- torchvision from master +- cocoapi +- yacs +- matplotlib +- GCC >= 4.9 +- (optional) OpenCV for the webcam demo + + +### Option 1: Step-by-step installation + +```bash +# first, make sure that your conda is setup properly with the right environment +# for that, check that `which conda`, `which pip` and `which python` points to the +# right path. From a clean conda env, this is what you need to do + +conda create --name maskrcnn_benchmark +conda activate maskrcnn_benchmark + +# this installs the right pip and dependencies for the fresh python +conda install ipython + +export INSTALL_DIR=$PWD + + +# install pycocotools +cd $INSTALL_DIR +git clone https://github.com/cocodataset/cocoapi.git +cd cocoapi/PythonAPI +python setup.py build_ext install + +# install PyTorch Detection +cd $INSTALL_DIR +git clone https://github.com/facebookresearch/maskrcnn-benchmark.git +cd maskrcnn-benchmark + +# maskrcnn_benchmark and coco api dependencies +pip install -r requirements.txt + +# follow PyTorch installation in https://pytorch.org/get-started/locally/ +# we give the instructions for CUDA 9.0 +conda install -c pytorch pytorch torchvision cudatoolkit=9.0 + +# the following will install the lib with +# symbolic links, so that you can modify +# the files if you want and won't need to +# re-build it +python setup.py build develop + + +unset INSTALL_DIR + +# or if you are on macOS +# MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py build develop +``` + +### Option 2: Docker Image (Requires CUDA, Linux only) + +Build image with defaults (`CUDA=9.0`, `CUDNN=7`): + + nvidia-docker build -t maskrcnn-benchmark docker/ + +Build image with other CUDA and CUDNN versions: + + nvidia-docker build -t maskrcnn-benchmark --build-arg CUDA=9.2 --build-arg CUDNN=7 docker/ + +Build and run image with built-in jupyter notebook(note that the password is used to log in jupyter notebook): + + nvidia-docker build -t maskrcnn-benchmark-jupyter docker/docker-jupyter/ + nvidia-docker run -td -p 8888:8888 -e PASSWORD= -v : maskrcnn-benchmark-jupyter diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..8585e11b83ab25bea5fbe4b8230fbf909f8b296b --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 Facebook + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
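+
+### Verifying the installation
+
+As an optional sanity check after Option 1 (a minimal sketch; it assumes the
+`python setup.py build develop` step above succeeded, and uses the compiled
+`maskrcnn_benchmark._C` extension that [TROUBLESHOOTING.md](TROUBLESHOOTING.md) refers to):
+
+```python
+# quick check that PyTorch and the compiled extensions can be imported
+import torch
+print(torch.__version__, torch.cuda.is_available())
+
+# this import fails if the C++/CUDA extensions were not built correctly
+from maskrcnn_benchmark import _C  # noqa: F401
+```
+
+If the last import raises an error, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md) for
+common causes such as mismatched NVCC / cudatoolkit versions or an incomplete install.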
diff --git a/MODEL_ZOO.md b/MODEL_ZOO.md new file mode 100644 index 0000000000000000000000000000000000000000..a0276d3d5d882311a5ac7633e7f3e88e71a40ce1 --- /dev/null +++ b/MODEL_ZOO.md @@ -0,0 +1,110 @@ +## Model Zoo and Baselines + +### Hardware +- 8 NVIDIA V100 GPUs + +### Software +- PyTorch version: 1.0.0a0+dd2c487 +- CUDA 9.2 +- CUDNN 7.1 +- NCCL 2.2.13-1 + +### End-to-end Faster and Mask R-CNN baselines + +All the baselines were trained using the exact same experimental setup as in Detectron. +We initialize the detection models with ImageNet weights from Caffe2, the same as used by Detectron. + +The pre-trained models are available in the link in the model id. + +backbone | type | lr sched | im / gpu | train mem(GB) | train time (s/iter) | total train time(hr) | inference time(s/im) | box AP | mask AP | model id +-- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- +R-50-C4 | Fast | 1x | 1 | 5.8 | 0.4036 | 20.2 | 0.17130 | 34.8 | - | [6358800](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_R_50_C4_1x.pth) +R-50-FPN | Fast | 1x | 2 | 4.4 | 0.3530 | 8.8 | 0.12580 | 36.8 | - | [6358793](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_R_50_FPN_1x.pth) +R-101-FPN | Fast | 1x | 2 | 7.1 | 0.4591 | 11.5 | 0.143149 | 39.1 | - | [6358804](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_R_101_FPN_1x.pth) +X-101-32x8d-FPN | Fast | 1x | 1 | 7.6 | 0.7007 | 35.0 | 0.209965 | 41.2 | - | [6358717](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_X_101_32x8d_FPN_1x.pth) +R-50-C4 | Mask | 1x | 1 | 5.8 | 0.4520 | 22.6 | 0.17796 + 0.028 | 35.6 | 31.5 | [6358801](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_R_50_C4_1x.pth) +R-50-FPN | Mask | 1x | 2 | 5.2 | 0.4536 | 11.3 | 0.12966 + 0.034 | 37.8 | 34.2 | [6358792](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_R_50_FPN_1x.pth) +R-101-FPN | Mask | 1x | 2 | 7.9 | 0.5665 | 14.2 | 0.15384 + 0.034 | 40.1 | 36.1 | [6358805](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_R_101_FPN_1x.pth) +X-101-32x8d-FPN | Mask | 1x | 1 | 7.8 | 0.7562 | 37.8 | 0.21739 + 0.034 | 42.2 | 37.8 | [6358718](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_X_101_32x8d_FPN_1x.pth) + +For person keypoint detection: + +backbone | type | lr sched | im / gpu | train mem(GB) | train time (s/iter) | total train time(hr) | inference time(s/im) | box AP | keypoint AP | model id +-- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- +R-50-FPN | Keypoint | 1x | 2 | 5.7 | 0.3771 | 9.4 | 0.10941 | 53.7 | 64.3 | 9981060 + +### Light-weight Model baselines + +We provided pre-trained models for selected FBNet models. +* All the models are trained from scratched with BN using the training schedule specified below. +* Evaluation is performed on a single NVIDIA V100 GPU with `MODEL.RPN.POST_NMS_TOP_N_TEST` set to `200`. + +The following inference time is reported: + * inference total batch=8: Total inference time including data loading, model inference and pre/post preprocessing using 8 images per batch. + * inference model batch=8: Model inference time only and using 8 images per batch. + * inference model batch=1: Model inference time only and using 1 image per batch. + * inferenee caffe2 batch=1: Model inference time for the model in Caffe2 format using 1 image per batch. The Caffe2 models fused the BN to Conv and purely run on C++/CUDA by using Caffe2 ops for rpn/detection post processing. + +The pre-trained models are available in the link in the model id. 
+ +backbone | type | resolution | lr sched | im / gpu | train mem(GB) | train time (s/iter) | total train time (hr) | inference total batch=8 (s/im) | inference model batch=8 (s/im) | inference model batch=1 (s/im) | inference caffe2 batch=1 (s/im) | box AP | mask AP | model id +-- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- +[R-50-C4](configs/e2e_faster_rcnn_R_50_C4_1x.yaml) (reference) | Fast | 800 | 1x | 1 | 5.8 | 0.4036 | 20.2 | 0.0875 | **0.0793** | 0.0831 | **0.0625** | 34.4 | - | f35857197 +[fbnet_chamv1a](configs/e2e_faster_rcnn_fbnet_chamv1a_600.yaml) | Fast | 600 | 0.75x | 12 | 13.6 | 0.5444 | 20.5 | 0.0315 | **0.0260** | 0.0376 | **0.0188** | 33.5 | - | [f100940543](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_fbnet_chamv1a_600.pth) +[fbnet_default](configs/e2e_faster_rcnn_fbnet_600.yaml) | Fast | 600 | 0.5x | 16 | 11.1 | 0.4872 | 12.5 | 0.0316 | **0.0250** | 0.0297 | **0.0130** | 28.2 | - | [f101086388](https://download.pytorch.org/models/maskrcnn/e2e_faster_rcnn_fbnet_600.pth) +[R-50-C4](configs/e2e_mask_rcnn_R_50_C4_1x.yaml) (reference) | Mask | 800 | 1x | 1 | 5.8 | 0.452 | 22.6 | 0.0918 | **0.0848** | 0.0844 | - | 35.2 | 31.0 | f35858791 +[fbnet_xirb16d](configs/e2e_mask_rcnn_fbnet_xirb16d_dsmask_600.yaml) | Mask | 600 | 0.5x | 16 | 13.4 | 1.1732 | 29 | 0.0386 | **0.0319** | 0.0356 | - | 30.7 | 26.9 | [f101086394](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_fbnet_xirb16d_dsmask.pth) +[fbnet_default](configs/e2e_mask_rcnn_fbnet_600.yaml) | Mask | 600 | 0.5x | 16 | 13.0 | 0.9036 | 23.0 | 0.0327 | **0.0269** | 0.0385 | - | 29.0 | 26.1 | [f101086385](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_fbnet_600.pth) + +## Comparison with Detectron and mmdetection + +In the following section, we compare our implementation with [Detectron](https://github.com/facebookresearch/Detectron) +and [mmdetection](https://github.com/open-mmlab/mmdetection). +The same remarks from [mmdetection](https://github.com/open-mmlab/mmdetection/blob/master/MODEL_ZOO.md#training-speed) +about different hardware applies here. + +### Training speed + +The numbers here are in seconds / iteration. The lower, the better. 
+ +type | Detectron (P100) | mmdetection (V100) | maskrcnn_benchmark (V100) +-- | -- | -- | -- +Faster R-CNN R-50 C4 | 0.566 | - | 0.4036 +Faster R-CNN R-50 FPN | 0.544 | 0.554 | 0.3530 +Faster R-CNN R-101 FPN | 0.647 | - | 0.4591 +Faster R-CNN X-101-32x8d FPN | 0.799 | - | 0.7007 +Mask R-CNN R-50 C4 | 0.620 | - | 0.4520 +Mask R-CNN R-50 FPN | 0.889 | 0.690 | 0.4536 +Mask R-CNN R-101 FPN | 1.008 | - | 0.5665 +Mask R-CNN X-101-32x8d FPN | 0.961 | - | 0.7562 + +### Training memory + +The lower, the better. + +type | Detectron (P100) | mmdetection (V100) | maskrcnn_benchmark (V100) +-- | -- | -- | -- +Faster R-CNN R-50 C4 | 6.3 | - | 5.8 +Faster R-CNN R-50 FPN | 7.2 | 4.9 | 4.4 +Faster R-CNN R-101 FPN | 8.9 | - | 7.1 +Faster R-CNN X-101-32x8d FPN | 7.0 | - | 7.6 +Mask R-CNN R-50 C4 | 6.6 | - | 5.8 +Mask R-CNN R-50 FPN | 8.6 | 5.9 | 5.2 +Mask R-CNN R-101 FPN | 10.2 | - | 7.9 +Mask R-CNN X-101-32x8d FPN | 7.7 | - | 7.8 + +### Accuracy + +The higher, the better. + +type | Detectron (P100) | mmdetection (V100) | maskrcnn_benchmark (V100) +-- | -- | -- | -- +Faster R-CNN R-50 C4 | 34.8 | - | 34.8 +Faster R-CNN R-50 FPN | 36.7 | 36.7 | 36.8 +Faster R-CNN R-101 FPN | 39.4 | - | 39.1 +Faster R-CNN X-101-32x8d FPN | 41.3 | - | 41.2 +Mask R-CNN R-50 C4 | 35.8 & 31.4 | - | 35.6 & 31.5 +Mask R-CNN R-50 FPN | 37.7 & 33.9 | 37.5 & 34.4 | 37.8 & 34.2 +Mask R-CNN R-101 FPN | 40.0 & 35.9 | - | 40.1 & 36.1 +Mask R-CNN X-101-32x8d FPN | 42.1 & 37.3 | - | 42.2 & 37.8 + diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..08eeb04dd976b65ccce55766ffad7109f75dd26e --- /dev/null +++ b/README.md @@ -0,0 +1,83 @@ +# Iterative Knowledge Distillation for Automatic Check-Out + +Introduction +----------------- +Automatic Check-Out (ACO) provides an object detection based mechanism for retailers to process customer purchases automatically. However, it suffers heavily from the domain shift problem caused by the different data distributions of single items in the training exemplar images and mixed items in the testing checkout images. In this paper, we propose a new iterative knowledge distillation method to solve the domain adaptation problem for this task. First, we develop a new data augmentation strategy to generate synthesized checkout images. It extracts segmented items from the training images with a coarse-to-fine strategy and filters items with realistic poses by pose pruning. Second, we propose a dual pyramid scale network (DPSNet) to exploit multi-scale feature representations in joint detection and counting views. Third, an iterative training strategy is developed to make full use of both image-level and instance-level samples, learning a common feature representation and narrowing the gap between the source and target domains. Extensive experiments on the large-scale Retail Product Checkout (RPC) dataset show that the proposed DPSNet achieves state-of-the-art performance compared with existing methods. + +![DPNet](demo/DPSNET.jpg) + +## Installation + +Check [INSTALL.md](INSTALL.md) for installation instructions. + +## Inference + +Run inference with pre-trained models using the command below. Images with boxes, labels and scores will +be saved to the `rpc_results` folder.
+ +```bash +python demo/rpc_demo.py --config-file configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_xxx.yaml --images_dir /path/to/test2019 +``` + +## Prepare dataset + +Use the `toolboxes` to extract masks, train a [Salient Object Detection](https://github.com/AceCoooool/DSS-pytorch) model +and render with [CycleGAN](https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix). Then modify `maskrcnn_benchmark/config/paths_catalog.py` +so that the dataset paths are correct. + +## Single GPU training + +Most of the configuration files that we provide assume that we are running on 4 GPUs. +In order to run it on fewer GPUs, there are a few possibilities: + +**1. Run the following without modifications** + +```bash +python tools/train_net.py --config-file "/path/to/config/file.yaml" +``` +This should work out of the box and is very similar to what you would do for multi-GPU training. +But the drawback is that it will use much more GPU memory. The reason is that the +configuration files set a global batch size that is divided over the number of GPUs. So if we only +have a single GPU, this means that the batch size for that GPU will be 8x larger, which might lead +to out-of-memory errors. + +If you have a lot of memory available, this is the easiest solution. + +**2. Modify the cfg parameters** + +If you experience out-of-memory errors, you can reduce the global batch size. But this means that +you'll also need to change the learning rate, the number of iterations and the learning rate schedule. + +Here is an example for Mask R-CNN R-50 FPN with the 1x schedule: +```bash +python tools/train_net.py --config-file "configs/e2e_mask_rcnn_R_50_FPN_1x.yaml" SOLVER.IMS_PER_BATCH 2 SOLVER.BASE_LR 0.0025 SOLVER.MAX_ITER 720000 SOLVER.STEPS "(480000, 640000)" TEST.IMS_PER_BATCH 1 +``` +This follows the [scheduling rules from Detectron](https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14-L30). +Note that we have multiplied the number of iterations by 8x (as well as the learning rate schedules), +and we have divided the learning rate by 8x. + +We also changed the batch size during testing, but that is generally not necessary because testing +requires much less memory than training. + + +## Multi-GPU training +We internally use `torch.distributed.launch` in order to launch +multi-GPU training. This utility function from PyTorch spawns as many +Python processes as the number of GPUs we want to use, and each Python +process will only use a single GPU. + +```bash +export NGPUS=4 +python -m torch.distributed.launch --nproc_per_node=$NGPUS tools/train_net.py --config-file "path/to/config/file.yaml" +``` +## Results + +| level | method | cAcc | mCIoU | ACD | mCCD | mAP50 | mmAP | +| ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | +| averaged | Render+IS (DPSNet) | 88.14% | 98.66% | 0.17 | 0.01 | 98.64% | 98.64% | + +## Citations +Please consider citing this project in your publications if it helps your research. The following is a BibTeX reference. The BibTeX entry requires the `url` LaTeX package.
+``` +TODO +``` \ No newline at end of file diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md new file mode 100644 index 0000000000000000000000000000000000000000..63a8b598b96a643cbd11ed84bbc06dc747683d67 --- /dev/null +++ b/TROUBLESHOOTING.md @@ -0,0 +1,67 @@ +# Troubleshooting + +Here is a compilation of common issues that you might face +while compiling / running this code: + +## Compilation errors when compiling the library +If you encounter build errors like the following: +``` +/usr/include/c++/6/type_traits:1558:8: note: provided for ‘template struct std::is_convertible’ + struct is_convertible + ^~~~~~~~~~~~~~ +/usr/include/c++/6/tuple:502:1: error: body of constexpr function ‘static constexpr bool std::_TC<, _Elements>::_NonNestedTuple() [with _SrcTuple = std::tuple&&; bool = true; _Elements = {at::Tensor, at::Tensor, at::Tensor, at::Tensor}]’ not a return-statement + } + ^ +error: command '/usr/local/cuda/bin/nvcc' failed with exit status 1 +``` +check your CUDA version and your `gcc` version. +``` +nvcc --version +gcc --version +``` +If you are using CUDA 9.0 and gcc 6.4.0, then refer to https://github.com/facebookresearch/maskrcnn-benchmark/issues/25, +which has a summary of the solution. Basically, CUDA 9.0 is not compatible with gcc 6.4.0. + +## ImportError: No module named maskrcnn_benchmark.config when running webcam.py + +This means that `maskrcnn-benchmark` has not been properly installed. +Refer to https://github.com/facebookresearch/maskrcnn-benchmark/issues/22 for a few possible issues. +Note that we now support Python 2 as well. + + +## ImportError: Undefined symbol: __cudaPopCallConfiguration error when importing _C + +This probably means that the version of NVCC used to compile the extensions is inconsistent with your conda CUDAToolKit package. This was first mentioned in https://github.com/facebookresearch/maskrcnn-benchmark/issues/45. All you need to do is: + +``` +# Check the NVCC compiler version (e.g.) +/usr/cuda-9.2/bin/nvcc --version +# Check the CUDAToolKit version (e.g.) +~/anaconda3/bin/conda list | grep cuda + +# If you need to update your CUDAToolKit +~/anaconda3/bin/conda install -c anaconda cudatoolkit==9.2 +``` + +Both of them should have the **same** version. For example, NVCC==9.2 with CUDAToolKit==9.2 is fine, while NVCC==9.2 with CUDAToolKit==9 fails. + + +## Segmentation fault (core dumped) when running the library +This probably means that you have compiled the library using GCC < 4.9, which is ABI-incompatible with PyTorch. +Indeed, during installation, you probably saw a message like +``` +Your compiler (g++ 4.8) may be ABI-incompatible with PyTorch! +Please use a compiler that is ABI-compatible with GCC 4.9 and above. +See https://gcc.gnu.org/onlinedocs/libstdc++/manual/abi.html. + +See https://gist.github.com/goldsborough/d466f43e8ffc948ff92de7486c5216d6 +for instructions on how to install GCC 4.9 or higher.
+``` +Follow the instructions on https://gist.github.com/goldsborough/d466f43e8ffc948ff92de7486c5216d6 +to install GCC 4.9 or higher, and try recompiling `maskrcnn-benchmark` again, after cleaning the +`build` folder with +``` +rm -rf build +``` + + diff --git a/configs/e2e_faster_rcnn_R_101_FPN_1x.yaml b/configs/e2e_faster_rcnn_R_101_FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..83b0c13f96ef45fcf98a126460e11ab8d14a45dd --- /dev/null +++ b/configs/e2e_faster_rcnn_R_101_FPN_1x.yaml @@ -0,0 +1,44 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101" + BACKBONE: + CONV_BODY: "R-101-FPN" + RESNETS: + BACKBONE_OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + DENSITY_HEAD: + NUM_CLASSES: 12 + FPN_LEVEL: 1 + FPN_LEVEL_STRIDE: 0.125 + HEATMAP_ON: True +DATASETS: + TRAIN: ("coco_2014_train_density", "coco_2014_valminusminival_density") + TEST: ("coco_2014_minival_density", ) +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + BASE_LR: 0.01 + WEIGHT_DECAY: 0.0001 + STEPS: (120000, 160000) + MAX_ITER: 180000 + CHECKPOINT_PERIOD: 5000 + IMS_PER_BATCH: 8 +TEST: + IMS_PER_BATCH: 8 + GENERATE_PSEUDO_LABELS: True + +OUTPUT_DIR: 'work_dirs/outputs_coco_density_12' \ No newline at end of file diff --git a/configs/e2e_faster_rcnn_R_101_FPN_1x_finetune.yaml b/configs/e2e_faster_rcnn_R_101_FPN_1x_finetune.yaml new file mode 100644 index 0000000000000000000000000000000000000000..679eac98e5605c79c0ca417368234edc084157a8 --- /dev/null +++ b/configs/e2e_faster_rcnn_R_101_FPN_1x_finetune.yaml @@ -0,0 +1,41 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101" + BACKBONE: + CONV_BODY: "R-101-FPN" + RESNETS: + BACKBONE_OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + HEATMAP_ON: False +DATASETS: + TRAIN: ("coco_2014_train", "coco_2014_valminusminival") + TEST: ("coco_2014_minival",) +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + BASE_LR: 0.0002 + WEIGHT_DECAY: 0.0001 + STEPS: (50000, 60000) + MAX_ITER: 70000 + CHECKPOINT_PERIOD: 2500 + WARMUP_ITERS: 0 + IMS_PER_BATCH: 8 +TEST: + IMS_PER_BATCH: 8 + GENERATE_PSEUDO_LABELS: False + +OUTPUT_DIR: 'outputs_coco_density_finetune' \ No newline at end of file diff --git a/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_finetune.yaml b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_finetune.yaml new file mode 100644 index 0000000000000000000000000000000000000000..847a473a5b6be3fabde0a64cf854d5286f5ce226 --- /dev/null +++ b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_finetune.yaml @@ -0,0 +1,39 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "CHANGE/ME" + BACKBONE: + CONV_BODY: "R-101-FPN" + RESNETS: + BACKBONE_OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + 
POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + NUM_CLASSES: 201 +DATASETS: + TRAIN: ("rpc_2019_pseudo",) + TEST: ("rpc_2019_test",) +DATALOADER: + SIZE_DIVISIBILITY: 32 + ASPECT_RATIO_GROUPING: False +SOLVER: + BASE_LR: 0.001 + WEIGHT_DECAY: 0.0001 + STEPS: (50000, 60000) + MAX_ITER: 70000 + IMS_PER_BATCH: 8 +TEST: + IMS_PER_BATCH: 8 + +OUTPUT_DIR: 'outputs_rpc_2019_train_render_final_density_map_finetune' \ No newline at end of file diff --git a/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_instance_select.yaml b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_instance_select.yaml new file mode 100644 index 0000000000000000000000000000000000000000..37f4aea2cf73df923be7946c6b712baa9aef8196 --- /dev/null +++ b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_instance_select.yaml @@ -0,0 +1,40 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "CHANGE/ME" + BACKBONE: + CONV_BODY: "R-101-FPN" + RESNETS: + BACKBONE_OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + NUM_CLASSES: 201 +DATASETS: + TRAIN: ("rpc_2019_instance_select",) + TEST: ("rpc_2019_test",) +DATALOADER: + SIZE_DIVISIBILITY: 32 + ASPECT_RATIO_GROUPING: False +SOLVER: + BASE_LR: 0.001 + WEIGHT_DECAY: 0.0001 + STEPS: (2500, 5000, 7500) + MAX_ITER: 10000 + IMS_PER_BATCH: 8 + WARMUP_ITERS: 0 +TEST: + IMS_PER_BATCH: 4 + +OUTPUT_DIR: 'outputs_rpc_2019_train_render_final_density_map_instance_select_paper' \ No newline at end of file diff --git a/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render.yaml b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45081d42f40d927c208f4d9a9f1788fd8cd10ab1 --- /dev/null +++ b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render.yaml @@ -0,0 +1,39 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101" + BACKBONE: + CONV_BODY: "R-101-FPN" + RESNETS: + BACKBONE_OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + NUM_CLASSES: 201 +DATASETS: + TRAIN: ("rpc_2019_train_render",) + TEST: ("rpc_images",) +DATALOADER: + SIZE_DIVISIBILITY: 32 + ASPECT_RATIO_GROUPING: False +SOLVER: + BASE_LR: 0.01 + WEIGHT_DECAY: 0.0001 + STEPS: (120000, 160000) + MAX_ITER: 180000 + IMS_PER_BATCH: 8 +TEST: + IMS_PER_BATCH: 4 + +OUTPUT_DIR: 'outputs_rpc_2019_train_render' \ No newline at end of file diff --git a/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render_cross_finetune.yaml b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render_cross_finetune.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ac48cf3fefcb5636b5087582ea4cbe0ab645cc89 --- /dev/null +++ 
b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render_cross_finetune.yaml @@ -0,0 +1,45 @@ +MODEL: + META_ARCHITECTURE: "AdaptionRCNN" + WEIGHT: "CHANGE/ME" + BACKBONE: + CONV_BODY: "R-101-FPN" + RESNETS: + BACKBONE_OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + NUM_CLASSES: 201 + HEATMAP_ON: True +DATASETS: + TRAIN: ("rpc_2019_train_render_density_map",) + TEST: ("rpc_2019_val",) +DATALOADER: + SIZE_DIVISIBILITY: 32 + ASPECT_RATIO_GROUPING: False + NUM_WORKERS: 8 +SOLVER: + BASE_LR: 0.001 + WEIGHT_DECAY: 0.0001 + STEPS: (70000, 100000, 120000) + MAX_ITER: 140000 + CROSS_TRAIN_STEPS: 14 + ITER_PER_STEP: 10000 + IMS_PER_BATCH: 8 +TEST: + IMS_PER_BATCH: 8 + GENERATE_PSEUDO_LABELS: True + PSEUDO_LABELS_ANN_FILE: '/data7/lufficc/projects/DPNet/outputs_rpc_2019_train_render_final_density_map/inference/rpc_2019_test/pseudo_labeling.json' + +OUTPUT_DIR: 'outputs_rpc_2019_train_render_density_map_cross_finetune' \ No newline at end of file diff --git a/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render_cross_finetune_17.yaml b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render_cross_finetune_17.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9545a805bf51c81d5086b10af095fc6d7733ddfd --- /dev/null +++ b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render_cross_finetune_17.yaml @@ -0,0 +1,49 @@ +MODEL: + META_ARCHITECTURE: "AdaptionRCNN" + WEIGHT: "CHANGE/ME" + BACKBONE: + CONV_BODY: "R-101-FPN" + RESNETS: + BACKBONE_OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + NUM_CLASSES: 201 + DENSITY_HEAD: + NUM_CLASSES: 17 + FPN_LEVEL: 1 + FPN_LEVEL_STRIDE: 0.125 + HEATMAP_ON: True +DATASETS: + TRAIN: ("rpc_2019_train_render_density_map",) + TEST: ("rpc_2019_val",) +DATALOADER: + SIZE_DIVISIBILITY: 32 + ASPECT_RATIO_GROUPING: False + NUM_WORKERS: 12 +SOLVER: + BASE_LR: 0.001 + WEIGHT_DECAY: 0.0001 + STEPS: (70000, 100000, 120000) + MAX_ITER: 140000 + CROSS_TRAIN_STEPS: 14 + ITER_PER_STEP: 10000 + IMS_PER_BATCH: 8 +TEST: + IMS_PER_BATCH: 8 + GENERATE_PSEUDO_LABELS: True + PSEUDO_LABELS_ANN_FILE: '/data7/lufficc/projects/DPNet/work_dirs/rpc_2019_train_render_density_map_17/inference/rpc_2019_test/pseudo_labeling.json' + +OUTPUT_DIR: 'work_dirs/rpc_2019_train_render_density_map_cross_finetune_17' \ No newline at end of file diff --git a/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render_cross_finetune_200.yaml b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render_cross_finetune_200.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3592184d05fd73c3e4a2c5238181ee6309a7354e --- /dev/null +++ b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render_cross_finetune_200.yaml @@ -0,0 +1,49 @@ +MODEL: + META_ARCHITECTURE: "AdaptionRCNN" + WEIGHT: "CHANGE/ME" + BACKBONE: + CONV_BODY: "R-101-FPN" + RESNETS: + BACKBONE_OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 
32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + NUM_CLASSES: 201 + DENSITY_HEAD: + NUM_CLASSES: 200 + FPN_LEVEL: 1 + FPN_LEVEL_STRIDE: 0.125 + HEATMAP_ON: True +DATASETS: + TRAIN: ("rpc_2019_train_render_density_map",) + TEST: ("rpc_2019_val",) +DATALOADER: + SIZE_DIVISIBILITY: 32 + ASPECT_RATIO_GROUPING: False + NUM_WORKERS: 12 +SOLVER: + BASE_LR: 0.001 + WEIGHT_DECAY: 0.0001 + STEPS: (70000, 100000, 120000) + MAX_ITER: 140000 + CROSS_TRAIN_STEPS: 14 + ITER_PER_STEP: 10000 + IMS_PER_BATCH: 8 +TEST: + IMS_PER_BATCH: 8 + GENERATE_PSEUDO_LABELS: True + PSEUDO_LABELS_ANN_FILE: '/data7/lufficc/projects/DPNet/work_dirs/rpc_2019_train_render_density_map_200/inference/rpc_2019_test/pseudo_labeling.json' + +OUTPUT_DIR: 'work_dirs/rpc_2019_train_render_density_map_cross_finetune_200' \ No newline at end of file diff --git a/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render_cross_finetune_gt.yaml b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render_cross_finetune_gt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..40f99c2998e786ff0d2be885e0b0a9bb51c3de1e --- /dev/null +++ b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render_cross_finetune_gt.yaml @@ -0,0 +1,48 @@ +MODEL: + META_ARCHITECTURE: "AdaptionRCNN" + WEIGHT: "CHANGE/ME" + BACKBONE: + CONV_BODY: "R-101-FPN" + RESNETS: + BACKBONE_OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + NUM_CLASSES: 201 + HEATMAP_ON: True +DATASETS: + TRAIN: ("rpc_2019_train_render_density_map",) + TEST: ("rpc_2019_val",) +DATALOADER: + SIZE_DIVISIBILITY: 32 + ASPECT_RATIO_GROUPING: False + NUM_WORKERS: 8 +SOLVER: + BASE_LR: 0.001 + WEIGHT_DECAY: 0.0001 + STEPS: (70000, 100000, 120000) + MAX_ITER: 140000 + CROSS_TRAIN_STEPS: 14 + ITER_PER_STEP: 10000 + CHECKPOINT_PERIOD: 5000 + IMS_PER_BATCH: 8 +TEST: + IMS_PER_BATCH: 8 + GENERATE_PSEUDO_LABELS: True + USE_GROUND_TRUTH: True + PSEUDO_LABELS_ANN_FILE: '/data7/lufficc/projects/DPNet/outputs_rpc_2019_train_render_final_density_map/inference/rpc_2019_test/pseudo_labeling_gt.json' + + +OUTPUT_DIR: 'outputs_rpc_2019_train_render_density_map_cross_finetune_gt_paper' \ No newline at end of file diff --git a/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render_density_map.yaml b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render_density_map.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6374373e926ba547aa1319d7583a10b4e9d854d3 --- /dev/null +++ b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render_density_map.yaml @@ -0,0 +1,42 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101" + BACKBONE: + CONV_BODY: "R-101-FPN" + RESNETS: + BACKBONE_OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + 
POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + NUM_CLASSES: 201 + HEATMAP_ON: True +DATASETS: + TRAIN: ("rpc_2019_train_render_density_map",) + TEST: ("rpc_2019_val",) +DATALOADER: + SIZE_DIVISIBILITY: 32 + ASPECT_RATIO_GROUPING: False + NUM_WORKERS: 8 +SOLVER: + BASE_LR: 0.01 + WEIGHT_DECAY: 0.0001 + STEPS: (120000, 160000) + MAX_ITER: 180000 + IMS_PER_BATCH: 8 +TEST: + IMS_PER_BATCH: 8 + GENERATE_PSEUDO_LABELS: True + +OUTPUT_DIR: 'outputs_rpc_2019_train_render_density_map' \ No newline at end of file diff --git a/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render_density_map_17.yaml b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render_density_map_17.yaml new file mode 100644 index 0000000000000000000000000000000000000000..caa981d028411b39ba42b2d64bcb73e571267e39 --- /dev/null +++ b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render_density_map_17.yaml @@ -0,0 +1,50 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101" + BACKBONE: + CONV_BODY: "R-101-FPN" + RESNETS: + BACKBONE_OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + NUM_CLASSES: 201 + DENSITY_HEAD: +# NUM_CLASSES: 17 +# FPN_LEVEL: 2 +# FPN_LEVEL_STRIDE: 0.0625 + NUM_CLASSES: 17 + FPN_LEVEL: 1 + FPN_LEVEL_STRIDE: 0.125 + HEATMAP_ON: True +DATASETS: + TRAIN: ("rpc_2019_train_render_density_map",) + TEST: ("rpc_2019_test",) +DATALOADER: + SIZE_DIVISIBILITY: 32 + ASPECT_RATIO_GROUPING: False + NUM_WORKERS: 12 +SOLVER: + BASE_LR: 0.01 + WEIGHT_DECAY: 0.0001 + STEPS: (120000, 160000) + MAX_ITER: 180000 + IMS_PER_BATCH: 8 + CHECKPOINT_PERIOD: 2500 +TEST: + IMS_PER_BATCH: 8 + GENERATE_PSEUDO_LABELS: True + +OUTPUT_DIR: 'work_dirs/rpc_2019_train_render_density_map_17' \ No newline at end of file diff --git a/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render_density_map_17_50x50.yaml b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render_density_map_17_50x50.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4a65212f1a511170e46f6876c3b701122758b4cf --- /dev/null +++ b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render_density_map_17_50x50.yaml @@ -0,0 +1,50 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101" + BACKBONE: + CONV_BODY: "R-101-FPN" + RESNETS: + BACKBONE_OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + NUM_CLASSES: 201 + DENSITY_HEAD: + NUM_CLASSES: 17 + FPN_LEVEL: 2 + FPN_LEVEL_STRIDE: 0.0625 +# NUM_CLASSES: 17 +# FPN_LEVEL: 1 +# FPN_LEVEL_STRIDE: 0.125 + HEATMAP_ON: True +DATASETS: + TRAIN: ("rpc_2019_train_render_density_map",) + TEST: ("rpc_2019_test",) +DATALOADER: + SIZE_DIVISIBILITY: 32 + ASPECT_RATIO_GROUPING: False + NUM_WORKERS: 12 +SOLVER: + BASE_LR: 0.01 + WEIGHT_DECAY: 0.0001 + STEPS: (120000, 160000) 
+ MAX_ITER: 180000 + IMS_PER_BATCH: 8 + CHECKPOINT_PERIOD: 5000 +TEST: + IMS_PER_BATCH: 8 + GENERATE_PSEUDO_LABELS: True + +OUTPUT_DIR: 'work_dirs/rpc_2019_train_render_density_map_17_50x50' \ No newline at end of file diff --git a/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render_density_map_200.yaml b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render_density_map_200.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bd2802ec5d0c4a54fd6c86d3a69b7a57ec0d1f5b --- /dev/null +++ b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render_density_map_200.yaml @@ -0,0 +1,50 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101" + BACKBONE: + CONV_BODY: "R-101-FPN" + RESNETS: + BACKBONE_OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + NUM_CLASSES: 201 + DENSITY_HEAD: +# NUM_CLASSES: 17 +# FPN_LEVEL: 2 +# FPN_LEVEL_STRIDE: 0.0625 + NUM_CLASSES: 200 + FPN_LEVEL: 1 + FPN_LEVEL_STRIDE: 0.125 + HEATMAP_ON: True +DATASETS: + TRAIN: ("rpc_2019_train_render_density_map",) + TEST: ("rpc_2019_test",) +DATALOADER: + SIZE_DIVISIBILITY: 32 + ASPECT_RATIO_GROUPING: False + NUM_WORKERS: 12 +SOLVER: + BASE_LR: 0.01 + WEIGHT_DECAY: 0.0001 + STEPS: (120000, 160000) + MAX_ITER: 180000 + IMS_PER_BATCH: 8 + CHECKPOINT_PERIOD: 2500 +TEST: + IMS_PER_BATCH: 8 + GENERATE_PSEUDO_LABELS: True + +OUTPUT_DIR: 'work_dirs/rpc_2019_train_render_density_map_200' \ No newline at end of file diff --git a/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render_density_map_200_50x50.yaml b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render_density_map_200_50x50.yaml new file mode 100644 index 0000000000000000000000000000000000000000..907fbbc18e4a6fd63114a0c8871daac7d3d9b499 --- /dev/null +++ b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_render_density_map_200_50x50.yaml @@ -0,0 +1,50 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101" + BACKBONE: + CONV_BODY: "R-101-FPN" + RESNETS: + BACKBONE_OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + NUM_CLASSES: 201 + DENSITY_HEAD: + NUM_CLASSES: 200 + FPN_LEVEL: 2 + FPN_LEVEL_STRIDE: 0.0625 +# NUM_CLASSES: 200 +# FPN_LEVEL: 1 +# FPN_LEVEL_STRIDE: 0.125 + HEATMAP_ON: True +DATASETS: + TRAIN: ("rpc_2019_train_render_density_map",) + TEST: ("rpc_2019_val",) +DATALOADER: + SIZE_DIVISIBILITY: 32 + ASPECT_RATIO_GROUPING: False + NUM_WORKERS: 12 +SOLVER: + BASE_LR: 0.01 + WEIGHT_DECAY: 0.0001 + STEPS: (120000, 160000) + MAX_ITER: 180000 + IMS_PER_BATCH: 8 + CHECKPOINT_PERIOD: 5000 +TEST: + IMS_PER_BATCH: 8 + GENERATE_PSEUDO_LABELS: True + +OUTPUT_DIR: 'work_dirs/rpc_2019_train_render_density_map_200_50x50' \ No newline at end of file diff --git a/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_syn.yaml b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_syn.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..7837da84b78632edec641569a372ff9f8cfdc163 --- /dev/null +++ b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_syn.yaml @@ -0,0 +1,39 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101" + BACKBONE: + CONV_BODY: "R-101-FPN" + RESNETS: + BACKBONE_OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + NUM_CLASSES: 201 +DATASETS: + TRAIN: ("rpc_2019_train_syn",) + TEST: ("rpc_2019_val",) +DATALOADER: + SIZE_DIVISIBILITY: 32 + ASPECT_RATIO_GROUPING: False +SOLVER: + BASE_LR: 0.01 + WEIGHT_DECAY: 0.0001 + STEPS: (120000, 160000) + MAX_ITER: 180000 + IMS_PER_BATCH: 8 +TEST: + IMS_PER_BATCH: 4 + +OUTPUT_DIR: 'outputs_rpc_2019_train_syn' \ No newline at end of file diff --git a/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_syn_density_map.yaml b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_syn_density_map.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b7d0d6c7a1e30bbc4937fb4b6f17ab31877af675 --- /dev/null +++ b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_syn_density_map.yaml @@ -0,0 +1,40 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101" + BACKBONE: + CONV_BODY: "R-101-FPN" + RESNETS: + BACKBONE_OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + NUM_CLASSES: 201 + HEATMAP_ON: True +DATASETS: + TRAIN: ("rpc_2019_train_syn_density_map",) + TEST: ("rpc_2019_val",) +DATALOADER: + SIZE_DIVISIBILITY: 32 + ASPECT_RATIO_GROUPING: False +SOLVER: + BASE_LR: 0.01 + WEIGHT_DECAY: 0.0001 + STEPS: (120000, 160000) + MAX_ITER: 180000 + IMS_PER_BATCH: 8 +TEST: + IMS_PER_BATCH: 4 + +OUTPUT_DIR: 'outputs_rpc_2019_train_syn_density_map' \ No newline at end of file diff --git a/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_syn_render.yaml b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_syn_render.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9cbecfc18886d8342e7f14e9f4de4dc97f9dca27 --- /dev/null +++ b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_syn_render.yaml @@ -0,0 +1,39 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101" + BACKBONE: + CONV_BODY: "R-101-FPN" + RESNETS: + BACKBONE_OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + NUM_CLASSES: 201 +DATASETS: + TRAIN: ("rpc_2019_train_syn", "rpc_2019_train_render") + TEST: ("rpc_2019_val",) +DATALOADER: + SIZE_DIVISIBILITY: 32 + ASPECT_RATIO_GROUPING: False +SOLVER: + BASE_LR: 0.01 + WEIGHT_DECAY: 
0.0001 + STEPS: (120000, 160000) + MAX_ITER: 180000 + IMS_PER_BATCH: 8 +TEST: + IMS_PER_BATCH: 4 + +OUTPUT_DIR: 'outputs_rpc_2019_train_syn_render' \ No newline at end of file diff --git a/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_syn_render_cross_finetune.yaml b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_syn_render_cross_finetune.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6b33e8a348cb74fce208eeee062e72263b5d31a4 --- /dev/null +++ b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_syn_render_cross_finetune.yaml @@ -0,0 +1,46 @@ +MODEL: + META_ARCHITECTURE: "AdaptionRCNN" + WEIGHT: "CHANGE/ME" + BACKBONE: + CONV_BODY: "R-101-FPN" + RESNETS: + BACKBONE_OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + NUM_CLASSES: 201 + HEATMAP_ON: True +DATASETS: + TRAIN: ("rpc_2019_train_render_density_map", "rpc_2019_train_syn_density_map") + TEST: ("rpc_2019_val",) +DATALOADER: + SIZE_DIVISIBILITY: 32 + ASPECT_RATIO_GROUPING: False + NUM_WORKERS: 8 +SOLVER: + BASE_LR: 0.001 + WEIGHT_DECAY: 0.0001 + STEPS: (70000, 100000, 120000) + MAX_ITER: 140000 + CROSS_TRAIN_STEPS: 14 + CROSS_TRAIN_DATA_RATIO: -100000 + ITER_PER_STEP: 10000 + IMS_PER_BATCH: 8 +TEST: + IMS_PER_BATCH: 8 + GENERATE_PSEUDO_LABELS: True + PSEUDO_LABELS_ANN_FILE: 'outputs_rpc_2019_train_syn_render_density_map_paper/inference/rpc_2019_test/pseudo_labeling.json' + +OUTPUT_DIR: 'outputs_rpc_2019_train_syn_render_cross_finetune_paper' \ No newline at end of file diff --git a/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_syn_render_density_map.yaml b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_syn_render_density_map.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6dcc1545cd13c55fcc14e36cae86d05e037185e3 --- /dev/null +++ b/configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_syn_render_density_map.yaml @@ -0,0 +1,42 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-101" + BACKBONE: + CONV_BODY: "R-101-FPN" + RESNETS: + BACKBONE_OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + NUM_CLASSES: 201 + HEATMAP_ON: True +DATASETS: + TRAIN: ("rpc_2019_train_render_density_map", "rpc_2019_train_syn_density_map") + TEST: ("rpc_2019_val",) +DATALOADER: + SIZE_DIVISIBILITY: 32 + ASPECT_RATIO_GROUPING: False + NUM_WORKERS: 8 +SOLVER: + BASE_LR: 0.01 + WEIGHT_DECAY: 0.0001 + STEPS: (120000, 160000) + MAX_ITER: 180000 + IMS_PER_BATCH: 8 +TEST: + IMS_PER_BATCH: 8 + GENERATE_PSEUDO_LABELS: True + +OUTPUT_DIR: 'outputs_rpc_2019_train_syn_render_density_map_paper' \ No newline at end of file diff --git a/demo/DPSNET.jpg b/demo/DPSNET.jpg new file mode 100644 index 0000000000000000000000000000000000000000..28575d79f644c87704cdb82c32afae74b11efc75 Binary files /dev/null and b/demo/DPSNET.jpg differ diff --git a/demo/predictor.py b/demo/predictor.py new file mode 100644 index 
0000000000000000000000000000000000000000..425df783b14a6eb962b1691f9d56ddd035d27715 --- /dev/null +++ b/demo/predictor.py @@ -0,0 +1,377 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import cv2 +import torch +from torchvision import transforms as T + +from maskrcnn_benchmark.modeling.detector import build_detection_model +from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer +from maskrcnn_benchmark.structures.image_list import to_image_list +from maskrcnn_benchmark.modeling.roi_heads.mask_head.inference import Masker +from maskrcnn_benchmark import layers as L +from maskrcnn_benchmark.utils import cv2_util + + +class COCODemo(object): + # RPC categories for pretty print + CATEGORIES = ['__background__', '1_puffed_food', '2_puffed_food', '3_puffed_food', '4_puffed_food', '5_puffed_food', '6_puffed_food', '7_puffed_food', + '8_puffed_food', '9_puffed_food', '10_puffed_food', '11_puffed_food', '12_puffed_food', '13_dried_fruit', '14_dried_fruit', '15_dried_fruit', + '16_dried_fruit', '17_dried_fruit', '18_dried_fruit', '19_dried_fruit', '20_dried_fruit', '21_dried_fruit', '22_dried_food', '23_dried_food', + '24_dried_food', '25_dried_food', '26_dried_food', '27_dried_food', '28_dried_food', '29_dried_food', '30_dried_food', '31_instant_drink', + '32_instant_drink', '33_instant_drink', '34_instant_drink', '35_instant_drink', '36_instant_drink', '37_instant_drink', '38_instant_drink', + '39_instant_drink', '40_instant_drink', '41_instant_drink', '42_instant_noodles', '43_instant_noodles', '44_instant_noodles', + '45_instant_noodles', '46_instant_noodles', '47_instant_noodles', '48_instant_noodles', '49_instant_noodles', '50_instant_noodles', + '51_instant_noodles', '52_instant_noodles', '53_instant_noodles', '54_dessert', '55_dessert', '56_dessert', '57_dessert', '58_dessert', + '59_dessert', '60_dessert', '61_dessert', '62_dessert', '63_dessert', '64_dessert', '65_dessert', '66_dessert', '67_dessert', '68_dessert', + '69_dessert', '70_dessert', '71_drink', '72_drink', '73_drink', '74_drink', '75_drink', '76_drink', '77_drink', '78_drink', '79_alcohol', + '80_alcohol', '81_drink', '82_drink', '83_drink', '84_drink', '85_drink', '86_drink', '87_drink', '88_alcohol', '89_alcohol', '90_alcohol', + '91_alcohol', '92_alcohol', '93_alcohol', '94_alcohol', '95_alcohol', '96_alcohol', '97_milk', '98_milk', '99_milk', '100_milk', '101_milk', + '102_milk', '103_milk', '104_milk', '105_milk', '106_milk', '107_milk', '108_canned_food', '109_canned_food', '110_canned_food', + '111_canned_food', '112_canned_food', '113_canned_food', '114_canned_food', '115_canned_food', '116_canned_food', '117_canned_food', + '118_canned_food', '119_canned_food', '120_canned_food', '121_canned_food', '122_chocolate', '123_chocolate', '124_chocolate', '125_chocolate', + '126_chocolate', '127_chocolate', '128_chocolate', '129_chocolate', '130_chocolate', '131_chocolate', '132_chocolate', '133_chocolate', '134_gum', + '135_gum', '136_gum', '137_gum', '138_gum', '139_gum', '140_gum', '141_gum', '142_candy', '143_candy', '144_candy', '145_candy', '146_candy', + '147_candy', '148_candy', '149_candy', '150_candy', '151_candy', '152_seasoner', '153_seasoner', '154_seasoner', '155_seasoner', '156_seasoner', + '157_seasoner', '158_seasoner', '159_seasoner', '160_seasoner', '161_seasoner', '162_seasoner', '163_seasoner', '164_personal_hygiene', + '165_personal_hygiene', '166_personal_hygiene', '167_personal_hygiene', '168_personal_hygiene', '169_personal_hygiene', '170_personal_hygiene', + 
'171_personal_hygiene', '172_personal_hygiene', '173_personal_hygiene', '174_tissue', '175_tissue', '176_tissue', '177_tissue', '178_tissue', + '179_tissue', '180_tissue', '181_tissue', '182_tissue', '183_tissue', '184_tissue', '185_tissue', '186_tissue', '187_tissue', '188_tissue', + '189_tissue', '190_tissue', '191_tissue', '192_tissue', '193_tissue', '194_stationery', '195_stationery', '196_stationery', '197_stationery', + '198_stationery', '199_stationery', '200_stationery'] + + def __init__( + self, + cfg, + confidence_threshold=0.7, + show_mask_heatmaps=False, + masks_per_dim=2, + min_image_size=224, + ): + self.cfg = cfg.clone() + self.model = build_detection_model(cfg) + self.model.eval() + self.device = torch.device(cfg.MODEL.DEVICE) + self.model.to(self.device) + self.min_image_size = min_image_size + + save_dir = cfg.OUTPUT_DIR + checkpointer = DetectronCheckpointer(cfg, self.model, save_dir=save_dir) + _ = checkpointer.load(cfg.MODEL.WEIGHT) + + self.transforms = self.build_transform() + + mask_threshold = -1 if show_mask_heatmaps else 0.5 + self.masker = Masker(threshold=mask_threshold, padding=1) + + # used to make colors for each class + self.palette = torch.tensor([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1]) + + self.cpu_device = torch.device("cpu") + self.confidence_threshold = confidence_threshold + self.show_mask_heatmaps = show_mask_heatmaps + self.masks_per_dim = masks_per_dim + + def build_transform(self): + """ + Creates a basic transformation that was used to train the models + """ + cfg = self.cfg + + # we are loading images with OpenCV, so we don't need to convert them + # to BGR, they are already! So all we need to do is to normalize + # by 255 if we want to convert to BGR255 format, or flip the channels + # if we want it to be in RGB in [0-1] range. + if cfg.INPUT.TO_BGR255: + to_bgr_transform = T.Lambda(lambda x: x * 255) + else: + to_bgr_transform = T.Lambda(lambda x: x[[2, 1, 0]]) + + normalize_transform = T.Normalize( + mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD + ) + + transform = T.Compose( + [ + T.ToPILImage(), + T.Resize(self.min_image_size), + T.ToTensor(), + to_bgr_transform, + normalize_transform, + ] + ) + return transform + + def run_on_opencv_image(self, image): + """ + Arguments: + image (np.ndarray): an image as returned by OpenCV + + Returns: + prediction (BoxList): the detected objects. Additional information + of the detection properties can be found in the fields of + the BoxList via `prediction.fields()` + """ + predictions = self.compute_prediction(image) + top_predictions = self.select_top_predictions(predictions) + + # result = image.copy() + # if self.show_mask_heatmaps: + # return self.create_mask_montage(result, top_predictions) + # result = self.overlay_boxes(result, top_predictions) + # if self.cfg.MODEL.MASK_ON: + # result = self.overlay_mask(result, top_predictions) + # if self.cfg.MODEL.KEYPOINT_ON: + # result = self.overlay_keypoints(result, top_predictions) + # result = self.overlay_class_names(result, top_predictions) + + return top_predictions + + def compute_prediction(self, original_image): + """ + Arguments: + original_image (np.ndarray): an image as returned by OpenCV + + Returns: + prediction (BoxList): the detected objects. 
Additional information + of the detection properties can be found in the fields of + the BoxList via `prediction.fields()` + """ + # apply pre-processing to image + image = self.transforms(original_image) + # convert to an ImageList, padded so that it is divisible by + # cfg.DATALOADER.SIZE_DIVISIBILITY + image_list = to_image_list(image, self.cfg.DATALOADER.SIZE_DIVISIBILITY) + image_list = image_list.to(self.device) + # compute predictions + with torch.no_grad(): + predictions = self.model(image_list) + predictions = [o.to(self.cpu_device) for o in predictions] + + # always single image is passed at a time + prediction = predictions[0] + + # reshape prediction (a BoxList) into the original image size + height, width = original_image.shape[:-1] + prediction = prediction.resize((width, height)) + + if prediction.has_field("mask"): + # if we have masks, paste the masks in the right position + # in the image, as defined by the bounding boxes + masks = prediction.get_field("mask") + # always single image is passed at a time + masks = self.masker([masks], [prediction])[0] + prediction.add_field("mask", masks) + return prediction + + def select_top_predictions(self, predictions): + """ + Select only predictions which have a `score` > self.confidence_threshold, + and returns the predictions in descending order of score + + Arguments: + predictions (BoxList): the result of the computation by the model. + It should contain the field `scores`. + + Returns: + prediction (BoxList): the detected objects. Additional information + of the detection properties can be found in the fields of + the BoxList via `prediction.fields()` + """ + scores = predictions.get_field("scores") + keep = torch.nonzero(scores > self.confidence_threshold).squeeze(1) + predictions = predictions[keep] + scores = predictions.get_field("scores") + _, idx = scores.sort(0, descending=True) + return predictions[idx] + + def compute_colors_for_labels(self, labels): + """ + Simple function that adds fixed colors depending on the class + """ + colors = labels[:, None] * self.palette + colors = (colors % 255).numpy().astype("uint8") + return colors + + def overlay_boxes(self, image, predictions): + """ + Adds the predicted boxes on top of the image + + Arguments: + image (np.ndarray): an image as returned by OpenCV + predictions (BoxList): the result of the computation by the model. + It should contain the field `labels`. + """ + labels = predictions.get_field("labels") + boxes = predictions.bbox + + colors = self.compute_colors_for_labels(labels).tolist() + + for box, color in zip(boxes, colors): + box = box.to(torch.int64) + top_left, bottom_right = box[:2].tolist(), box[2:].tolist() + image = cv2.rectangle( + image, tuple(top_left), tuple(bottom_right), tuple(color), 2 + ) + + return image + + def overlay_mask(self, image, predictions): + """ + Adds the instances contours for each predicted object. + Each label has a different color. + + Arguments: + image (np.ndarray): an image as returned by OpenCV + predictions (BoxList): the result of the computation by the model. + It should contain the field `mask` and `labels`. 
+ """ + masks = predictions.get_field("mask").numpy() + labels = predictions.get_field("labels") + + colors = self.compute_colors_for_labels(labels).tolist() + + for mask, color in zip(masks, colors): + thresh = mask[0, :, :, None] + contours, hierarchy = cv2_util.findContours( + thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE + ) + image = cv2.drawContours(image, contours, -1, color, 3) + + composite = image + + return composite + + def overlay_keypoints(self, image, predictions): + keypoints = predictions.get_field("keypoints") + kps = keypoints.keypoints + scores = keypoints.get_field("logits") + kps = torch.cat((kps[:, :, 0:2], scores[:, :, None]), dim=2).numpy() + for region in kps: + image = vis_keypoints(image, region.transpose((1, 0))) + return image + + def create_mask_montage(self, image, predictions): + """ + Create a montage showing the probability heatmaps for each one one of the + detected objects + + Arguments: + image (np.ndarray): an image as returned by OpenCV + predictions (BoxList): the result of the computation by the model. + It should contain the field `mask`. + """ + masks = predictions.get_field("mask") + masks_per_dim = self.masks_per_dim + masks = L.interpolate( + masks.float(), scale_factor=1 / masks_per_dim + ).byte() + height, width = masks.shape[-2:] + max_masks = masks_per_dim ** 2 + masks = masks[:max_masks] + # handle case where we have less detections than max_masks + if len(masks) < max_masks: + masks_padded = torch.zeros(max_masks, 1, height, width, dtype=torch.uint8) + masks_padded[: len(masks)] = masks + masks = masks_padded + masks = masks.reshape(masks_per_dim, masks_per_dim, height, width) + result = torch.zeros( + (masks_per_dim * height, masks_per_dim * width), dtype=torch.uint8 + ) + for y in range(masks_per_dim): + start_y = y * height + end_y = (y + 1) * height + for x in range(masks_per_dim): + start_x = x * width + end_x = (x + 1) * width + result[start_y:end_y, start_x:end_x] = masks[y, x] + return cv2.applyColorMap(result.numpy(), cv2.COLORMAP_JET) + + def overlay_class_names(self, image, predictions): + """ + Adds detected class names and scores in the positions defined by the + top-left corner of the predicted bounding box + + Arguments: + image (np.ndarray): an image as returned by OpenCV + predictions (BoxList): the result of the computation by the model. + It should contain the field `scores` and `labels`. + """ + scores = predictions.get_field("scores").tolist() + labels = predictions.get_field("labels").tolist() + labels = [self.CATEGORIES[i] for i in labels] + boxes = predictions.bbox + + template = "{}: {:.2f}" + for box, score, label in zip(boxes, scores, labels): + x, y = box[:2] + s = template.format(label, score) + cv2.putText( + image, s, (x, y), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2 + ) + + return image + + +import numpy as np +import matplotlib.pyplot as plt +from maskrcnn_benchmark.structures.keypoint import PersonKeypoints + + +def vis_keypoints(img, kps, kp_thresh=2, alpha=0.7): + """Visualizes keypoints (adapted from vis_one_image). + kps has shape (4, #keypoints) where 4 rows are (x, y, logit, prob). + """ + dataset_keypoints = PersonKeypoints.NAMES + kp_lines = PersonKeypoints.CONNECTIONS + + # Convert from plt 0-1 RGBA colors to 0-255 BGR colors for opencv. + cmap = plt.get_cmap('rainbow') + colors = [cmap(i) for i in np.linspace(0, 1, len(kp_lines) + 2)] + colors = [(c[2] * 255, c[1] * 255, c[0] * 255) for c in colors] + + # Perform the drawing on a copy of the image, to allow for blending. 
+ kp_mask = np.copy(img) + + # Draw mid shoulder / mid hip first for better visualization. + mid_shoulder = ( + kps[:2, dataset_keypoints.index('right_shoulder')] + + kps[:2, dataset_keypoints.index('left_shoulder')]) / 2.0 + sc_mid_shoulder = np.minimum( + kps[2, dataset_keypoints.index('right_shoulder')], + kps[2, dataset_keypoints.index('left_shoulder')]) + mid_hip = ( + kps[:2, dataset_keypoints.index('right_hip')] + + kps[:2, dataset_keypoints.index('left_hip')]) / 2.0 + sc_mid_hip = np.minimum( + kps[2, dataset_keypoints.index('right_hip')], + kps[2, dataset_keypoints.index('left_hip')]) + nose_idx = dataset_keypoints.index('nose') + if sc_mid_shoulder > kp_thresh and kps[2, nose_idx] > kp_thresh: + cv2.line( + kp_mask, tuple(mid_shoulder), tuple(kps[:2, nose_idx]), + color=colors[len(kp_lines)], thickness=2, lineType=cv2.LINE_AA) + if sc_mid_shoulder > kp_thresh and sc_mid_hip > kp_thresh: + cv2.line( + kp_mask, tuple(mid_shoulder), tuple(mid_hip), + color=colors[len(kp_lines) + 1], thickness=2, lineType=cv2.LINE_AA) + + # Draw the keypoints. + for l in range(len(kp_lines)): + i1 = kp_lines[l][0] + i2 = kp_lines[l][1] + p1 = kps[0, i1], kps[1, i1] + p2 = kps[0, i2], kps[1, i2] + if kps[2, i1] > kp_thresh and kps[2, i2] > kp_thresh: + cv2.line( + kp_mask, p1, p2, + color=colors[l], thickness=2, lineType=cv2.LINE_AA) + if kps[2, i1] > kp_thresh: + cv2.circle( + kp_mask, p1, + radius=3, color=colors[l], thickness=-1, lineType=cv2.LINE_AA) + if kps[2, i2] > kp_thresh: + cv2.circle( + kp_mask, p2, + radius=3, color=colors[l], thickness=-1, lineType=cv2.LINE_AA) + + # Blend the keypoints. + return cv2.addWeighted(img, 1.0 - alpha, kp_mask, alpha, 0) diff --git a/demo/results.png b/demo/results.png new file mode 100755 index 0000000000000000000000000000000000000000..75578328de3ae69f66a0b5def87bccf33b85b804 Binary files /dev/null and b/demo/results.png differ diff --git a/demo/rpc_demo.py b/demo/rpc_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..9d221e398774f441d2220b7736998de45df7e9e0 --- /dev/null +++ b/demo/rpc_demo.py @@ -0,0 +1,137 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import argparse +import glob +import json +import os +import random +import shutil +from collections import defaultdict + +import cv2 +from PIL import ImageFont +from tqdm import tqdm +from vizer.draw import draw_boxes + +from maskrcnn_benchmark.config import cfg +from predictor import COCODemo + + +def main(): + parser = argparse.ArgumentParser(description="DPNet Demo") + parser.add_argument( + "--config-file", + default="configs/e2e_faster_rcnn_R_101_FPN_1x_rpc_syn_render_density_map.yaml", + metavar="FILE", + help="path to config file", + ) + parser.add_argument( + "--images_dir", + required=True, + type=str, + help="path to images file", + ) + parser.add_argument( + "--save_dir", + default='rpc_results', + type=str, + help="path to images file", + ) + parser.add_argument( + "--confidence-threshold", + type=float, + default=0.7, + help="Minimum score for the prediction to be shown", + ) + parser.add_argument( + "--min-image-size", + type=int, + default=800, + help="Smallest size of the image to feed to the model. 
" + "Model was trained with 800, which gives best results", + ) + parser.add_argument( + "--show-mask-heatmaps", + dest="show_mask_heatmaps", + help="Show a heatmap probability for the top masks-per-dim masks", + action="store_true", + ) + parser.add_argument( + "--masks-per-dim", + type=int, + default=2, + help="Number of heatmaps per dimension to show", + ) + parser.add_argument( + "opts", + help="Modify model config options using the command-line", + default=None, + nargs=argparse.REMAINDER, + ) + + args = parser.parse_args() + + # load config from file and command-line arguments + cfg.merge_from_file(args.config_file) + cfg.merge_from_list(args.opts) + cfg.freeze() + + # prepare object that handles inference plus adds predictions on top of image + coco_demo = COCODemo( + cfg, + confidence_threshold=args.confidence_threshold, + show_mask_heatmaps=args.show_mask_heatmaps, + masks_per_dim=args.masks_per_dim, + min_image_size=args.min_image_size, + ) + if os.path.exists(args.save_dir): + shutil.rmtree(args.save_dir) + os.mkdir(args.save_dir) + + with open('/data7/lufficc/rpc/instances_test2019.json') as fid: + data = json.load(fid) + + images = {} + for x in data['images']: + images[x['id']] = x + + annotations = defaultdict(list) + for x in data['annotations']: + annotations[images[x['image_id']]['file_name']].append(x) + annotations = dict(annotations) + + counter = { + 'easy': 0, + 'medium': 0, + 'hard': 0, + } + + data_images = data['images'].copy() + random.shuffle(data_images) + FONT = ImageFont.truetype('/data7/lufficc/projects/DPNet/demo/arial.ttf', 8) + for image_ann in data_images: + if counter[image_ann['level']] >= 20: + continue + image_path = os.path.join(args.images_dir, image_ann['file_name']) + img = cv2.imread(image_path) + annotation = annotations[image_ann['file_name']] + prediction = coco_demo.run_on_opencv_image(img) + + new_size = (400, 400) + + img = cv2.resize(img, new_size) + prediction = prediction.resize(new_size) + + boxes = prediction.bbox.numpy() + labels = prediction.get_field('labels').numpy() + scores = prediction.get_field('scores').numpy() + + img = draw_boxes(img, boxes, labels, scores, COCODemo.CATEGORIES, width=2, font=FONT, alpha=0.4) + gt_labels = sorted([ann['category_id'] for ann in annotation]) + if gt_labels == sorted(labels.tolist()): + print('Get {}.'.format(image_ann['level'])) + cv2.imwrite(os.path.join(args.save_dir, image_ann['level'] + '_' + os.path.basename(image_path)), img) + counter[image_ann['level']] += 1 + + +if __name__ == "__main__": + main() diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..39b508258a0748785c7cc2f33f09c11b6cafaa43 --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,53 @@ +ARG CUDA="9.0" +ARG CUDNN="7" + +FROM nvidia/cuda:${CUDA}-cudnn${CUDNN}-devel-ubuntu16.04 + +RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections + +# install basics +RUN apt-get update -y \ + && apt-get install -y apt-utils git curl ca-certificates bzip2 cmake tree htop bmon iotop g++ \ + && apt-get install -y libglib2.0-0 libsm6 libxext6 libxrender-dev + +# Install Miniconda +RUN curl -so /miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \ + && chmod +x /miniconda.sh \ + && /miniconda.sh -b -p /miniconda \ + && rm /miniconda.sh + +ENV PATH=/miniconda/bin:$PATH + +# Create a Python 3.6 environment +RUN /miniconda/bin/conda install -y conda-build \ + && /miniconda/bin/conda create -y --name py36 python=3.6.7 \ + && 
/miniconda/bin/conda clean -ya + +ENV CONDA_DEFAULT_ENV=py36 +ENV CONDA_PREFIX=/miniconda/envs/$CONDA_DEFAULT_ENV +ENV PATH=$CONDA_PREFIX/bin:$PATH +ENV CONDA_AUTO_UPDATE_CONDA=false + +RUN conda install -y ipython +RUN pip install ninja yacs cython matplotlib opencv-python + +# Install PyTorch 1.0 Nightly and OpenCV +RUN conda install -y pytorch-nightly -c pytorch \ + && conda clean -ya + +# Install TorchVision master +RUN git clone https://github.com/pytorch/vision.git \ + && cd vision \ + && python setup.py install + +# install pycocotools +RUN git clone https://github.com/cocodataset/cocoapi.git \ + && cd cocoapi/PythonAPI \ + && python setup.py build_ext install + +# install PyTorch Detection +RUN git clone https://github.com/facebookresearch/maskrcnn-benchmark.git \ + && cd maskrcnn-benchmark \ + && python setup.py build develop + +WORKDIR /maskrcnn-benchmark diff --git a/docker/docker-jupyter/Dockerfile b/docker/docker-jupyter/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..323727195e44bb60c7e80d0c6f5e4c6bb13154a9 --- /dev/null +++ b/docker/docker-jupyter/Dockerfile @@ -0,0 +1,67 @@ +ARG CUDA="9.0" +ARG CUDNN="7" + +FROM nvidia/cuda:${CUDA}-cudnn${CUDNN}-devel-ubuntu16.04 + +RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections + +# install basics +RUN apt-get update -y \ + && apt-get install -y apt-utils git curl ca-certificates bzip2 cmake tree htop bmon iotop g++ + +# Install Miniconda +RUN curl -so /miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \ + && chmod +x /miniconda.sh \ + && /miniconda.sh -b -p /miniconda \ + && rm /miniconda.sh + +ENV PATH=/miniconda/bin:$PATH + +# Create a Python 3.6 environment +RUN /miniconda/bin/conda install -y conda-build \ + && /miniconda/bin/conda create -y --name py36 python=3.6.7 \ + && /miniconda/bin/conda clean -ya + +ENV CONDA_DEFAULT_ENV=py36 +ENV CONDA_PREFIX=/miniconda/envs/$CONDA_DEFAULT_ENV +ENV PATH=$CONDA_PREFIX/bin:$PATH +ENV CONDA_AUTO_UPDATE_CONDA=false + +RUN conda install -y ipython +RUN pip install ninja yacs cython matplotlib jupyter + +# Install PyTorch 1.0 Nightly and OpenCV +RUN conda install -y pytorch-nightly -c pytorch \ + && conda install -y opencv -c menpo \ + && conda clean -ya + +WORKDIR /root + +USER root + +RUN mkdir /notebooks + +WORKDIR /notebooks + +# Install TorchVision master +RUN git clone https://github.com/pytorch/vision.git \ + && cd vision \ + && python setup.py install + +# install pycocotools +RUN git clone https://github.com/cocodataset/cocoapi.git \ + && cd cocoapi/PythonAPI \ + && python setup.py build_ext install + +# install PyTorch Detection +RUN git clone https://github.com/facebookresearch/maskrcnn-benchmark.git \ + && cd maskrcnn-benchmark \ + && python setup.py build develop + +RUN jupyter notebook --generate-config + +ENV CONFIG_PATH="/root/.jupyter/jupyter_notebook_config.py" + +COPY "jupyter_notebook_config.py" ${CONFIG_PATH} + +ENTRYPOINT ["sh", "-c", "jupyter notebook --allow-root -y --no-browser --ip=0.0.0.0 --config=${CONFIG_PATH}"] diff --git a/docker/docker-jupyter/jupyter_notebook_config.py b/docker/docker-jupyter/jupyter_notebook_config.py new file mode 100644 index 0000000000000000000000000000000000000000..bd5494812303bee0471d0d7d0a4d259da6fbed4e --- /dev/null +++ b/docker/docker-jupyter/jupyter_notebook_config.py @@ -0,0 +1,18 @@ +import os +from IPython.lib import passwd + +#c = c # pylint:disable=undefined-variable +c = get_config() +c.NotebookApp.ip = '0.0.0.0' +c.NotebookApp.port = 
int(os.getenv('PORT', 8888)) +c.NotebookApp.open_browser = False + +# sets a password if PASSWORD is set in the environment +if 'PASSWORD' in os.environ: + password = os.environ['PASSWORD'] + if password: + c.NotebookApp.password = passwd(password) + else: + c.NotebookApp.password = '' + c.NotebookApp.token = '' + del os.environ['PASSWORD'] diff --git a/maskrcnn_benchmark/__init__.py b/maskrcnn_benchmark/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5c7f19c6c00a4ac3f2f2bc66f892e44bcbd72612 --- /dev/null +++ b/maskrcnn_benchmark/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. diff --git a/maskrcnn_benchmark/config/__init__.py b/maskrcnn_benchmark/config/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..22a15023b1b06dad1f8c36924cdbb96bf1f5dc8d --- /dev/null +++ b/maskrcnn_benchmark/config/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from .defaults import _C as cfg diff --git a/maskrcnn_benchmark/config/defaults.py b/maskrcnn_benchmark/config/defaults.py new file mode 100644 index 0000000000000000000000000000000000000000..9883683bedac6d52ed472f7ed6c70267da11d9b2 --- /dev/null +++ b/maskrcnn_benchmark/config/defaults.py @@ -0,0 +1,428 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import os + +from yacs.config import CfgNode as CN + +# ----------------------------------------------------------------------------- +# Convention about Training / Test specific parameters +# ----------------------------------------------------------------------------- +# Whenever an argument can be either used for training or for testing, the +# corresponding name will be post-fixed by a _TRAIN for a training parameter, +# or _TEST for a test-specific parameter. +# For example, the number of images during training will be +# IMAGES_PER_BATCH_TRAIN, while the number of images for testing will be +# IMAGES_PER_BATCH_TEST + +# ----------------------------------------------------------------------------- +# Config definition +# ----------------------------------------------------------------------------- + +_C = CN() + +_C.MODEL = CN() +_C.MODEL.RPN_ONLY = False +_C.MODEL.MASK_ON = False +_C.MODEL.RETINANET_ON = False +_C.MODEL.KEYPOINT_ON = False +_C.MODEL.HEATMAP_ON = False +_C.MODEL.DEVICE = "cuda" +_C.MODEL.META_ARCHITECTURE = "GeneralizedRCNN" +_C.MODEL.CLS_AGNOSTIC_BBOX_REG = False + +# If the WEIGHT starts with a catalog://, like :R-50, the code will look for +# the path in paths_catalog. Else, it will use it as the specified absolute +# path +_C.MODEL.WEIGHT = "" + +# ----------------------------------------------------------------------------- +# INPUT +# ----------------------------------------------------------------------------- +_C.INPUT = CN() +# Size of the smallest side of the image during training +_C.INPUT.MIN_SIZE_TRAIN = (800,) # (800,) +# Maximum size of the side of the image during training +_C.INPUT.MAX_SIZE_TRAIN = 1333 +# Size of the smallest side of the image during testing +_C.INPUT.MIN_SIZE_TEST = 800 +# Maximum size of the side of the image during testing +_C.INPUT.MAX_SIZE_TEST = 1333 +# Values to be used for image normalization +_C.INPUT.PIXEL_MEAN = [102.9801, 115.9465, 122.7717] +# Values to be used for image normalization +_C.INPUT.PIXEL_STD = [1., 1., 1.] 
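+# Note (descriptive comment): these statistics are applied after the image has been
+# brought into the 0-255 BGR range (see TO_BGR255 below and build_transform in
+# demo/predictor.py), i.e. each channel is normalized as (pixel - PIXEL_MEAN) / PIXEL_STD.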
+# Convert image to BGR format (for Caffe2 models), in range 0-255 +_C.INPUT.TO_BGR255 = True + +# ----------------------------------------------------------------------------- +# Dataset +# ----------------------------------------------------------------------------- +_C.DATASETS = CN() +# List of the dataset names for training, as present in paths_catalog.py +_C.DATASETS.TRAIN = () +# List of the dataset names for testing, as present in paths_catalog.py +_C.DATASETS.TEST = () + +# ----------------------------------------------------------------------------- +# DataLoader +# ----------------------------------------------------------------------------- +_C.DATALOADER = CN() +# Number of data loading threads +_C.DATALOADER.NUM_WORKERS = 4 +# If > 0, this enforces that each collated batch should have a size divisible +# by SIZE_DIVISIBILITY +_C.DATALOADER.SIZE_DIVISIBILITY = 0 +# If True, each batch should contain only images for which the aspect ratio +# is compatible. This groups portrait images together, and landscape images +# are not batched with portrait images. +_C.DATALOADER.ASPECT_RATIO_GROUPING = True + +# ---------------------------------------------------------------------------- # +# Backbone options +# ---------------------------------------------------------------------------- # +_C.MODEL.BACKBONE = CN() + +# The backbone conv body to use +# The string must match a function that is imported in modeling.model_builder +# (e.g., 'FPN.add_fpn_ResNet101_conv5_body' to specify a ResNet-101-FPN +# backbone) +_C.MODEL.BACKBONE.CONV_BODY = "R-50-C4" +_C.MODEL.BACKBONE.FUSE_FPN = False + +# Add StopGrad at a specified stage so the bottom layers are frozen +_C.MODEL.BACKBONE.FREEZE_CONV_BODY_AT = 2 +# GN for backbone +_C.MODEL.BACKBONE.USE_GN = False + +# ---------------------------------------------------------------------------- # +# FPN options +# ---------------------------------------------------------------------------- # +_C.MODEL.FPN = CN() +_C.MODEL.FPN.USE_GN = False +_C.MODEL.FPN.USE_RELU = False + +# ---------------------------------------------------------------------------- # +# Group Norm options +# ---------------------------------------------------------------------------- # +_C.MODEL.GROUP_NORM = CN() +# Number of dimensions per group in GroupNorm (-1 if using NUM_GROUPS) +_C.MODEL.GROUP_NORM.DIM_PER_GP = -1 +# Number of groups in GroupNorm (-1 if using DIM_PER_GP) +_C.MODEL.GROUP_NORM.NUM_GROUPS = 32 +# GroupNorm's small constant in the denominator +_C.MODEL.GROUP_NORM.EPSILON = 1e-5 + +# ---------------------------------------------------------------------------- # +# RPN options +# ---------------------------------------------------------------------------- # +_C.MODEL.RPN = CN() +_C.MODEL.RPN.USE_FPN = False +# Base RPN anchor sizes given in absolute pixels w.r.t. the scaled network input +_C.MODEL.RPN.ANCHOR_SIZES = (32, 64, 128, 256, 512) +# Stride of the feature map that RPN is attached. +# For FPN, number of strides should match number of scales +_C.MODEL.RPN.ANCHOR_STRIDE = (16,) +# RPN anchor aspect ratios +_C.MODEL.RPN.ASPECT_RATIOS = (0.5, 1.0, 2.0) +# Remove RPN anchors that go outside the image by RPN_STRADDLE_THRESH pixels +# Set to -1 or a large value, e.g. 
100000, to disable pruning anchors +_C.MODEL.RPN.STRADDLE_THRESH = 0 +# Minimum overlap required between an anchor and ground-truth box for the +# (anchor, gt box) pair to be a positive example (IoU >= FG_IOU_THRESHOLD +# ==> positive RPN example) +_C.MODEL.RPN.FG_IOU_THRESHOLD = 0.7 +# Maximum overlap allowed between an anchor and ground-truth box for the +# (anchor, gt box) pair to be a negative examples (IoU < BG_IOU_THRESHOLD +# ==> negative RPN example) +_C.MODEL.RPN.BG_IOU_THRESHOLD = 0.3 +# Total number of RPN examples per image +_C.MODEL.RPN.BATCH_SIZE_PER_IMAGE = 256 +# Target fraction of foreground (positive) examples per RPN minibatch +_C.MODEL.RPN.POSITIVE_FRACTION = 0.5 +# Number of top scoring RPN proposals to keep before applying NMS +# When FPN is used, this is *per FPN level* (not total) +_C.MODEL.RPN.PRE_NMS_TOP_N_TRAIN = 12000 +_C.MODEL.RPN.PRE_NMS_TOP_N_TEST = 6000 +# Number of top scoring RPN proposals to keep after applying NMS +_C.MODEL.RPN.POST_NMS_TOP_N_TRAIN = 2000 +_C.MODEL.RPN.POST_NMS_TOP_N_TEST = 1000 +# NMS threshold used on RPN proposals +_C.MODEL.RPN.NMS_THRESH = 0.7 +# Proposal height and width both need to be greater than RPN_MIN_SIZE +# (a the scale used during training or inference) +_C.MODEL.RPN.MIN_SIZE = 0 +# Number of top scoring RPN proposals to keep after combining proposals from +# all FPN levels +_C.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN = 2000 +_C.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST = 2000 +# Custom rpn head, empty to use default conv or separable conv +_C.MODEL.RPN.RPN_HEAD = "SingleConvRPNHead" + +# ---------------------------------------------------------------------------- # +# ROI HEADS options +# ---------------------------------------------------------------------------- # +_C.MODEL.ROI_HEADS = CN() +_C.MODEL.ROI_HEADS.USE_FPN = False +# Overlap threshold for an RoI to be considered foreground (if >= FG_IOU_THRESHOLD) +_C.MODEL.ROI_HEADS.FG_IOU_THRESHOLD = 0.5 +# Overlap threshold for an RoI to be considered background +# (class = 0 if overlap in [0, BG_IOU_THRESHOLD)) +_C.MODEL.ROI_HEADS.BG_IOU_THRESHOLD = 0.5 +# Default weights on (dx, dy, dw, dh) for normalizing bbox regression targets +# These are empirically chosen to approximately lead to unit variance targets +_C.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS = (10., 10., 5., 5.) +# RoI minibatch size *per image* (number of regions of interest [ROIs]) +# Total number of RoIs per training minibatch = +# TRAIN.BATCH_SIZE_PER_IM * TRAIN.IMS_PER_BATCH +# E.g., a common configuration is: 512 * 2 * 8 = 8192 +_C.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512 +# Target fraction of RoI minibatch that is labeled foreground (i.e. 
class > 0) +_C.MODEL.ROI_HEADS.POSITIVE_FRACTION = 0.25 + +# Only used on test mode + +# Minimum score threshold (assuming scores in a [0, 1] range); a value chosen to +# balance obtaining high recall with not having too many low precision +# detections that will slow down inference post processing steps (like NMS) +_C.MODEL.ROI_HEADS.SCORE_THRESH = 0.05 +# Overlap threshold used for non-maximum suppression (suppress boxes with +# IoU >= this threshold) +_C.MODEL.ROI_HEADS.NMS = 0.5 +# Maximum number of detections to return per image (100 is based on the limit +# established for the COCO dataset) +_C.MODEL.ROI_HEADS.DETECTIONS_PER_IMG = 100 + +_C.MODEL.ROI_BOX_HEAD = CN() +_C.MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR = "ResNet50Conv5ROIFeatureExtractor" +_C.MODEL.ROI_BOX_HEAD.PREDICTOR = "FastRCNNPredictor" +_C.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 14 +_C.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO = 0 +_C.MODEL.ROI_BOX_HEAD.POOLER_SCALES = (1.0 / 16,) +_C.MODEL.ROI_BOX_HEAD.NUM_CLASSES = 81 +# Hidden layer dimension when using an MLP for the RoI box head +_C.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM = 1024 +# GN +_C.MODEL.ROI_BOX_HEAD.USE_GN = False +# Dilation +_C.MODEL.ROI_BOX_HEAD.DILATION = 1 +_C.MODEL.ROI_BOX_HEAD.CONV_HEAD_DIM = 256 +_C.MODEL.ROI_BOX_HEAD.NUM_STACKED_CONVS = 4 + +_C.MODEL.ROI_MASK_HEAD = CN() +_C.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR = "ResNet50Conv5ROIFeatureExtractor" +_C.MODEL.ROI_MASK_HEAD.PREDICTOR = "MaskRCNNC4Predictor" +_C.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION = 14 +_C.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO = 0 +_C.MODEL.ROI_MASK_HEAD.POOLER_SCALES = (1.0 / 16,) +_C.MODEL.ROI_MASK_HEAD.MLP_HEAD_DIM = 1024 +_C.MODEL.ROI_MASK_HEAD.CONV_LAYERS = (256, 256, 256, 256) +_C.MODEL.ROI_MASK_HEAD.RESOLUTION = 14 +_C.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR = True +# Whether or not resize and translate masks to the input image. 
+_C.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS = False +_C.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS_THRESHOLD = 0.5 +# Dilation +_C.MODEL.ROI_MASK_HEAD.DILATION = 1 +# GN +_C.MODEL.ROI_MASK_HEAD.USE_GN = False + +_C.MODEL.ROI_KEYPOINT_HEAD = CN() +_C.MODEL.ROI_KEYPOINT_HEAD.FEATURE_EXTRACTOR = "KeypointRCNNFeatureExtractor" +_C.MODEL.ROI_KEYPOINT_HEAD.PREDICTOR = "KeypointRCNNPredictor" +_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION = 14 +_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO = 0 +_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SCALES = (1.0 / 16,) +_C.MODEL.ROI_KEYPOINT_HEAD.MLP_HEAD_DIM = 1024 +_C.MODEL.ROI_KEYPOINT_HEAD.CONV_LAYERS = tuple(512 for _ in range(8)) +_C.MODEL.ROI_KEYPOINT_HEAD.RESOLUTION = 14 +_C.MODEL.ROI_KEYPOINT_HEAD.NUM_CLASSES = 17 +_C.MODEL.ROI_KEYPOINT_HEAD.SHARE_BOX_FEATURE_EXTRACTOR = True + +# ---------------------------------------------------------------------------- # +# ResNe[X]t options (ResNets = {ResNet, ResNeXt} +# Note that parts of a resnet may be used for both the backbone and the head +# These options apply to both +# ---------------------------------------------------------------------------- # +_C.MODEL.RESNETS = CN() + +# Number of groups to use; 1 ==> ResNet; > 1 ==> ResNeXt +_C.MODEL.RESNETS.NUM_GROUPS = 1 + +# Baseline width of each group +_C.MODEL.RESNETS.WIDTH_PER_GROUP = 64 + +# Place the stride 2 conv on the 1x1 filter +# Use True only for the original MSRA ResNet; use False for C2 and Torch models +_C.MODEL.RESNETS.STRIDE_IN_1X1 = True + +# Residual transformation function +_C.MODEL.RESNETS.TRANS_FUNC = "BottleneckWithFixedBatchNorm" +# ResNet's stem function (conv1 and pool1) +_C.MODEL.RESNETS.STEM_FUNC = "StemWithFixedBatchNorm" + +# Apply dilation in stage "res5" +_C.MODEL.RESNETS.RES5_DILATION = 1 + +_C.MODEL.RESNETS.BACKBONE_OUT_CHANNELS = 256 * 4 +_C.MODEL.RESNETS.RES2_OUT_CHANNELS = 256 +_C.MODEL.RESNETS.STEM_OUT_CHANNELS = 64 + +# ---------------------------------------------------------------------------- # +# Density Head Options +# ---------------------------------------------------------------------------- # +_C.MODEL.DENSITY_HEAD = CN() +_C.MODEL.DENSITY_HEAD.NUM_CLASSES = 1 +_C.MODEL.DENSITY_HEAD.FPN_LEVEL = 1 # (p2, p3, p4, p5, p6), 1 for p3 +_C.MODEL.DENSITY_HEAD.FPN_LEVEL_STRIDE = 1.0 / 8 +_C.MODEL.DENSITY_HEAD.INTERPOLATE_MODE = 'bilinear' + +# ---------------------------------------------------------------------------- # +# RetinaNet Options (Follow the Detectron version) +# ---------------------------------------------------------------------------- # +_C.MODEL.RETINANET = CN() + +# This is the number of foreground classes and background. 
+_C.MODEL.RETINANET.NUM_CLASSES = 81 + +# Anchor aspect ratios to use +_C.MODEL.RETINANET.ANCHOR_SIZES = (32, 64, 128, 256, 512) +_C.MODEL.RETINANET.ASPECT_RATIOS = (0.5, 1.0, 2.0) +_C.MODEL.RETINANET.ANCHOR_STRIDES = (8, 16, 32, 64, 128) +_C.MODEL.RETINANET.STRADDLE_THRESH = 0 + +# Anchor scales per octave +_C.MODEL.RETINANET.OCTAVE = 2.0 +_C.MODEL.RETINANET.SCALES_PER_OCTAVE = 3 + +# Use C5 or P5 to generate P6 +_C.MODEL.RETINANET.USE_C5 = True + +# Convolutions to use in the cls and bbox tower +# NOTE: this doesn't include the last conv for logits +_C.MODEL.RETINANET.NUM_CONVS = 4 + +# Weight for bbox_regression loss +_C.MODEL.RETINANET.BBOX_REG_WEIGHT = 4.0 + +# Smooth L1 loss beta for bbox regression +_C.MODEL.RETINANET.BBOX_REG_BETA = 0.11 + +# During inference, #locs to select based on cls score before NMS is performed +# per FPN level +_C.MODEL.RETINANET.PRE_NMS_TOP_N = 1000 + +# IoU overlap ratio for labeling an anchor as positive +# Anchors with >= iou overlap are labeled positive +_C.MODEL.RETINANET.FG_IOU_THRESHOLD = 0.5 + +# IoU overlap ratio for labeling an anchor as negative +# Anchors with < iou overlap are labeled negative +_C.MODEL.RETINANET.BG_IOU_THRESHOLD = 0.4 + +# Focal loss parameter: alpha +_C.MODEL.RETINANET.LOSS_ALPHA = 0.25 + +# Focal loss parameter: gamma +_C.MODEL.RETINANET.LOSS_GAMMA = 2.0 + +# Prior prob for the positives at the beginning of training. This is used to set +# the bias init for the logits layer +_C.MODEL.RETINANET.PRIOR_PROB = 0.01 + +# Inference cls score threshold, anchors with score > INFERENCE_TH are +# considered for inference +_C.MODEL.RETINANET.INFERENCE_TH = 0.05 + +# NMS threshold used in RetinaNet +_C.MODEL.RETINANET.NMS_TH = 0.4 + +# ---------------------------------------------------------------------------- # +# FBNet options +# ---------------------------------------------------------------------------- # +_C.MODEL.FBNET = CN() +_C.MODEL.FBNET.ARCH = "default" +# custom arch +_C.MODEL.FBNET.ARCH_DEF = "" +_C.MODEL.FBNET.BN_TYPE = "bn" +_C.MODEL.FBNET.SCALE_FACTOR = 1.0 +# the output channels will be divisible by WIDTH_DIVISOR +_C.MODEL.FBNET.WIDTH_DIVISOR = 1 +_C.MODEL.FBNET.DW_CONV_SKIP_BN = True +_C.MODEL.FBNET.DW_CONV_SKIP_RELU = True + +# > 0 scale, == 0 skip, < 0 same dimension +_C.MODEL.FBNET.DET_HEAD_LAST_SCALE = 1.0 +_C.MODEL.FBNET.DET_HEAD_BLOCKS = [] +# overwrite the stride for the head, 0 to use original value +_C.MODEL.FBNET.DET_HEAD_STRIDE = 0 + +# > 0 scale, == 0 skip, < 0 same dimension +_C.MODEL.FBNET.KPTS_HEAD_LAST_SCALE = 0.0 +_C.MODEL.FBNET.KPTS_HEAD_BLOCKS = [] +# overwrite the stride for the head, 0 to use original value +_C.MODEL.FBNET.KPTS_HEAD_STRIDE = 0 + +# > 0 scale, == 0 skip, < 0 same dimension +_C.MODEL.FBNET.MASK_HEAD_LAST_SCALE = 0.0 +_C.MODEL.FBNET.MASK_HEAD_BLOCKS = [] +# overwrite the stride for the head, 0 to use original value +_C.MODEL.FBNET.MASK_HEAD_STRIDE = 0 + +# 0 to use all blocks defined in arch_def +_C.MODEL.FBNET.RPN_HEAD_BLOCKS = 0 +_C.MODEL.FBNET.RPN_BN_TYPE = "" + +# ---------------------------------------------------------------------------- # +# Solver +# ---------------------------------------------------------------------------- # +_C.SOLVER = CN() +_C.SOLVER.MAX_ITER = 40000 +_C.SOLVER.CROSS_TRAIN_STEPS = 10 +_C.SOLVER.CROSS_TRAIN_DATA_RATIO = 0 +_C.SOLVER.ITER_PER_STEP = 10000 + +_C.SOLVER.BASE_LR = 0.001 +_C.SOLVER.BIAS_LR_FACTOR = 2 + +_C.SOLVER.MOMENTUM = 0.9 + +_C.SOLVER.WEIGHT_DECAY = 0.0005 +_C.SOLVER.WEIGHT_DECAY_BIAS = 0 + +_C.SOLVER.GAMMA = 0.1 +_C.SOLVER.STEPS = 
(30000,) + +_C.SOLVER.WARMUP_FACTOR = 1.0 / 3 +_C.SOLVER.WARMUP_ITERS = 500 +_C.SOLVER.WARMUP_METHOD = "linear" + +_C.SOLVER.CHECKPOINT_PERIOD = 2500 + +# Number of images per batch +# This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will +# see 2 images per batch +_C.SOLVER.IMS_PER_BATCH = 16 + +# ---------------------------------------------------------------------------- # +# Specific test options +# ---------------------------------------------------------------------------- # +_C.TEST = CN() +_C.TEST.EXPECTED_RESULTS = [] +_C.TEST.EXPECTED_RESULTS_SIGMA_TOL = 4 +_C.TEST.GENERATE_PSEUDO_LABELS = False +_C.TEST.PSEUDO_LABELS_ANN_FILE = '' +_C.TEST.TEST_IMAGES_DIR = '/data7/lufficc/rpc/test2019/' +_C.TEST.TEST_ANN_FILE = '/data7/lufficc/rpc/instances_test2019.json' +_C.TEST.USE_GROUND_TRUTH = False +# Number of images per batch +# This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will +# see 2 images per batch +_C.TEST.IMS_PER_BATCH = 8 +# Number of detections per image +_C.TEST.DETECTIONS_PER_IMG = 100 + +# ---------------------------------------------------------------------------- # +# Misc options +# ---------------------------------------------------------------------------- # +_C.OUTPUT_DIR = "." + +_C.PATHS_CATALOG = os.path.join(os.path.dirname(__file__), "paths_catalog.py") diff --git a/maskrcnn_benchmark/config/paths_catalog.py b/maskrcnn_benchmark/config/paths_catalog.py new file mode 100644 index 0000000000000000000000000000000000000000..278ec644e7369a9286a31b59a08b80d3f3814265 --- /dev/null +++ b/maskrcnn_benchmark/config/paths_catalog.py @@ -0,0 +1,333 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +"""Centralized catalog of paths.""" + +import os + + +class DatasetCatalog(object): + DATA_DIR = "datasets" + DATASETS = { + "coco_2017_train": { + "img_dir": "coco/train2017", + "ann_file": "coco/annotations/instances_train2017.json" + }, + "coco_2017_val": { + "img_dir": "coco/val2017", + "ann_file": "coco/annotations/instances_val2017.json" + }, + "coco_2017_unlabel": { + # 'pseudo_labels_file': None, + "pseudo_labels_file": '/data7/lufficc/projects/DPNet/outputs_coco_density/inference/coco_2017_unlabel/pseudo_labeling.json', + "img_dir": "coco/unlabeled2017", + "ann_file": "coco/annotations/image_info_unlabeled2017.json", + "use_density_map": False, + }, + "coco_2014_train": { + "img_dir": "coco/train2014", + "ann_file": "coco/annotations/instances_train2014.json" + }, + "coco_2014_train_density": { + "use_density_map": True, + "img_dir": "coco/train2014", + "ann_file": "coco/annotations/instances_train2014.json" + }, + "coco_2014_val": { + "img_dir": "coco/val2014", + "ann_file": "coco/annotations/instances_val2014.json" + }, + "coco_2014_minival": { + "img_dir": "coco/val2014", + "ann_file": "coco/annotations/instances_minival2014.json" + }, + "coco_2014_minival_density": { + "use_density_map": False, + "img_dir": "coco/val2014", + "ann_file": "coco/annotations/instances_minival2014.json" + }, + "coco_2014_valminusminival": { + "img_dir": "coco/val2014", + "ann_file": "coco/annotations/instances_valminusminival2014.json" + }, + "coco_2014_valminusminival_density": { + "use_density_map": True, + "img_dir": "coco/val2014", + "ann_file": "coco/annotations/instances_valminusminival2014.json" + }, + "keypoints_coco_2014_train": { + "img_dir": "coco/train2014", + "ann_file": "coco/annotations/person_keypoints_train2014.json", + }, + "keypoints_coco_2014_val": { + "img_dir": "coco/val2014", + "ann_file": 
"coco/annotations/person_keypoints_val2014.json" + }, + "keypoints_coco_2014_minival": { + "img_dir": "coco/val2014", + "ann_file": "coco/annotations/person_keypoints_minival2014.json", + }, + "keypoints_coco_2014_valminusminival": { + "img_dir": "coco/val2014", + "ann_file": "coco/annotations/person_keypoints_valminusminival2014.json", + }, + "voc_2007_train": { + "data_dir": "voc/VOC2007", + "split": "train" + }, + "voc_2007_train_cocostyle": { + "img_dir": "voc/VOC2007/JPEGImages", + "ann_file": "voc/VOC2007/Annotations/pascal_train2007.json" + }, + "voc_2007_val": { + "data_dir": "voc/VOC2007", + "split": "val" + }, + "voc_2007_val_cocostyle": { + "img_dir": "voc/VOC2007/JPEGImages", + "ann_file": "voc/VOC2007/Annotations/pascal_val2007.json" + }, + "voc_2007_test": { + "data_dir": "voc/VOC2007", + "split": "test" + }, + "voc_2007_test_cocostyle": { + "img_dir": "voc/VOC2007/JPEGImages", + "ann_file": "voc/VOC2007/Annotations/pascal_test2007.json" + }, + "voc_2012_train": { + "data_dir": "voc/VOC2012", + "split": "train" + }, + "voc_2012_train_cocostyle": { + "img_dir": "voc/VOC2012/JPEGImages", + "ann_file": "voc/VOC2012/Annotations/pascal_train2012.json" + }, + "voc_2012_val": { + "data_dir": "voc/VOC2012", + "split": "val" + }, + "voc_2012_val_cocostyle": { + "img_dir": "voc/VOC2012/JPEGImages", + "ann_file": "voc/VOC2012/Annotations/pascal_val2012.json" + }, + "voc_2012_test": { + "data_dir": "voc/VOC2012", + "split": "test" + # PASCAL VOC2012 doesn't made the test annotations available, so there's no json annotation + }, + "cityscapes_fine_instanceonly_seg_train_cocostyle": { + "img_dir": "cityscapes/images", + "ann_file": "cityscapes/annotations/instancesonly_filtered_gtFine_train.json" + }, + "cityscapes_fine_instanceonly_seg_val_cocostyle": { + "img_dir": "cityscapes/images", + "ann_file": "cityscapes/annotations/instancesonly_filtered_gtFine_val.json" + }, + "cityscapes_fine_instanceonly_seg_test_cocostyle": { + "img_dir": "cityscapes/images", + "ann_file": "cityscapes/annotations/instancesonly_filtered_gtFine_test.json" + }, + # ------------------------------------------------- + # -----------------RPC eval dataset---------------- + # ------------------------------------------------- + "rpc_2019_test": { + "images_dir": "/data7/lufficc/rpc/test2019/", + "ann_file": '/data7/lufficc/rpc/instances_test2019.json', + }, + "rpc_2019_val": { + "images_dir": "/data7/lufficc/rpc/val2019/", + "ann_file": '/data7/lufficc/rpc/instances_val2019.json', + }, + # ------------------------------------------------- + # -----------------RPC train dataset--------------- + # ------------------------------------------------- + "rpc_2019_train_syn": { + "images_dir": "/data7/lufficc/rpc/synthesize_v10_masks_density_map_0_45_threshold", + "ann_file": '/data7/lufficc/rpc/synthesize_v10_masks_density_map_0_45_threshold.json', + 'use_density_map': False, + 'rendered': False, + }, + "rpc_2019_train_render": { + "images_dir": "/data7/lufficc/rpc/synthesize_v10_masks_density_map_0_45_threshold_cyclegan", + "ann_file": '/data7/lufficc/rpc/synthesize_v10_masks_density_map_0_45_threshold.json', + 'use_density_map': False, + 'rendered': True, + }, + "rpc_2019_train_render_density_map": { + "images_dir": "/data7/lufficc/rpc/synthesize_v10_masks_density_map_0_45_threshold_cyclegan", + "ann_file": '/data7/lufficc/rpc/synthesize_v10_masks_density_map_0_45_threshold.json', + 'use_density_map': True, + 'rendered': True, + }, + "rpc_2019_train_syn_density_map": { + "images_dir": 
"/data7/lufficc/rpc/synthesize_v10_masks_density_map_0_45_threshold", + "ann_file": '/data7/lufficc/rpc/synthesize_v10_masks_density_map_0_45_threshold.json', + 'use_density_map': True, + 'rendered': False, + }, + "rpc_2019_pseudo": { + "images_dir": "/data7/lufficc/rpc/test2019/", + "ann_file": '/data7/lufficc/projects/DPNet/outputs_rpc_2019_train_render_final_density_map/inference' + '/rpc_2019_test/pseudo_labeling.json', + }, + "rpc_2019_pseudo_density": { + 'use_density_map': True, + "images_dir": "/data7/lufficc/rpc/test2019/", + "ann_file": '/data7/lufficc/projects/DPNet/outputs_rpc_2019_train_render_final_density_map/inference' + '/rpc_2019_test/pseudo_labeling.json', + }, + "rpc_2019_instance_select": { + "images_dir": "/data7/lufficc/rpc/test2019/", + "ann_file": '/data7/lufficc/projects/DPNet/outputs_rpc_2019_train_render_final_density_map_cross_finetune_paper/' + 'inference/rpc_2019_test/bbox_results.json', + }, + 'rpc_images': { + + } + } + + @staticmethod + def get(name): + if name in ('coco_2014_train_density', 'coco_2014_valminusminival_density', 'coco_2014_minival_density'): + data_dir = DatasetCatalog.DATA_DIR + attrs = DatasetCatalog.DATASETS[name] + args = dict( + root=os.path.join(data_dir, attrs["img_dir"]), + ann_file=os.path.join(data_dir, attrs["ann_file"]), + use_density_map=attrs['use_density_map'], + ) + return dict( + factory="COCODensityDataset", + args=args, + ) + elif name in ('coco_2017_unlabel',): + data_dir = DatasetCatalog.DATA_DIR + attrs = DatasetCatalog.DATASETS[name] + args = dict( + img_dir=os.path.join(data_dir, attrs["img_dir"]), + ann_file=os.path.join(data_dir, attrs["ann_file"]), + use_density_map=attrs['use_density_map'], + pseudo_labels_file=attrs['pseudo_labels_file'], + ) + return dict( + factory="CocoUnlabelDataset", + args=args, + ) + elif "coco" in name: + data_dir = DatasetCatalog.DATA_DIR + attrs = DatasetCatalog.DATASETS[name] + args = dict( + root=os.path.join(data_dir, attrs["img_dir"]), + ann_file=os.path.join(data_dir, attrs["ann_file"]), + ) + return dict( + factory="COCODataset", + args=args, + ) + elif "voc" in name: + data_dir = DatasetCatalog.DATA_DIR + attrs = DatasetCatalog.DATASETS[name] + args = dict( + data_dir=os.path.join(data_dir, attrs["data_dir"]), + split=attrs["split"], + ) + return dict( + factory="PascalVOCDataset", + args=args, + ) + elif "rpc_2019_train" in name: + attrs = DatasetCatalog.DATASETS[name] + args = dict(attrs) + return dict( + factory="RPCDataset", + args=args, + ) + elif name in ('rpc_2019_test', 'rpc_2019_val'): + attrs = DatasetCatalog.DATASETS[name] + args = dict(attrs) + return dict( + factory="RPCTestDataset", + args=args, + ) + elif name in ('rpc_2019_pseudo',): + attrs = DatasetCatalog.DATASETS[name] + args = dict(attrs) + return dict( + factory="RPCPseudoDataset", + args=args, + ) + elif name in ('rpc_2019_instance_select',): + attrs = DatasetCatalog.DATASETS[name] + args = dict(attrs) + return dict( + factory="RPCInstanceSelectDataset", + args=args, + ) + elif name == 'rpc_images': + attrs = DatasetCatalog.DATASETS[name] + args = dict(attrs) + return dict( + factory="ImagesDataset", + args=args, + ) + raise RuntimeError("Dataset not available: {}".format(name)) + + +class ModelCatalog(object): + S3_C2_DETECTRON_URL = "https://dl.fbaipublicfiles.com/detectron" + C2_IMAGENET_MODELS = { + "MSRA/R-50": "ImageNetPretrained/MSRA/R-50.pkl", + "MSRA/R-50-GN": "ImageNetPretrained/47261647/R-50-GN.pkl", + "MSRA/R-101": "ImageNetPretrained/MSRA/R-101.pkl", + "MSRA/R-101-GN": 
"ImageNetPretrained/47592356/R-101-GN.pkl", + "FAIR/20171220/X-101-32x8d": "ImageNetPretrained/20171220/X-101-32x8d.pkl", + } + + C2_DETECTRON_SUFFIX = "output/train/{}coco_2014_train%3A{}coco_2014_valminusminival/generalized_rcnn/model_final.pkl" + C2_DETECTRON_MODELS = { + "35857197/e2e_faster_rcnn_R-50-C4_1x": "01_33_49.iAX0mXvW", + "35857345/e2e_faster_rcnn_R-50-FPN_1x": "01_36_30.cUF7QR7I", + "35857890/e2e_faster_rcnn_R-101-FPN_1x": "01_38_50.sNxI7sX7", + "36761737/e2e_faster_rcnn_X-101-32x8d-FPN_1x": "06_31_39.5MIHi1fZ", + "35858791/e2e_mask_rcnn_R-50-C4_1x": "01_45_57.ZgkA7hPB", + "35858933/e2e_mask_rcnn_R-50-FPN_1x": "01_48_14.DzEQe4wC", + "35861795/e2e_mask_rcnn_R-101-FPN_1x": "02_31_37.KqyEK4tT", + "36761843/e2e_mask_rcnn_X-101-32x8d-FPN_1x": "06_35_59.RZotkLKI", + "37129812/e2e_mask_rcnn_X-152-32x8d-FPN-IN5k_1.44x": "09_35_36.8pzTQKYK", + # keypoints + "37697547/e2e_keypoint_rcnn_R-50-FPN_1x": "08_42_54.kdzV35ao" + } + + @staticmethod + def get(name): + if name.startswith("Caffe2Detectron/COCO"): + return ModelCatalog.get_c2_detectron_12_2017_baselines(name) + if name.startswith("ImageNetPretrained"): + return ModelCatalog.get_c2_imagenet_pretrained(name) + raise RuntimeError("model not present in the catalog {}".format(name)) + + @staticmethod + def get_c2_imagenet_pretrained(name): + prefix = ModelCatalog.S3_C2_DETECTRON_URL + name = name[len("ImageNetPretrained/"):] + name = ModelCatalog.C2_IMAGENET_MODELS[name] + url = "/".join([prefix, name]) + return url + + @staticmethod + def get_c2_detectron_12_2017_baselines(name): + # Detectron C2 models are stored following the structure + # prefix//2012_2017_baselines/.yaml./suffix + # we use as identifiers in the catalog Caffe2Detectron/COCO// + prefix = ModelCatalog.S3_C2_DETECTRON_URL + dataset_tag = "keypoints_" if "keypoint" in name else "" + suffix = ModelCatalog.C2_DETECTRON_SUFFIX.format(dataset_tag, dataset_tag) + # remove identification prefix + name = name[len("Caffe2Detectron/COCO/"):] + # split in and + model_id, model_name = name.split("/") + # parsing to make it match the url address from the Caffe2 models + model_name = "{}.yaml".format(model_name) + signature = ModelCatalog.C2_DETECTRON_MODELS[name] + unique_name = ".".join([model_name, signature]) + url = "/".join([prefix, model_id, "12_2017_baselines", unique_name, suffix]) + return url diff --git a/maskrcnn_benchmark/csrc/ROIAlign.h b/maskrcnn_benchmark/csrc/ROIAlign.h new file mode 100644 index 0000000000000000000000000000000000000000..3907deab2a750a9f83f0f3ef38fee279c1445c61 --- /dev/null +++ b/maskrcnn_benchmark/csrc/ROIAlign.h @@ -0,0 +1,46 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+#pragma once + +#include "cpu/vision.h" + +#ifdef WITH_CUDA +#include "cuda/vision.h" +#endif + +// Interface for Python +at::Tensor ROIAlign_forward(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio) { + if (input.type().is_cuda()) { +#ifdef WITH_CUDA + return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + return ROIAlign_forward_cpu(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); +} + +at::Tensor ROIAlign_backward(const at::Tensor& grad, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width, + const int sampling_ratio) { + if (grad.type().is_cuda()) { +#ifdef WITH_CUDA + return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + diff --git a/maskrcnn_benchmark/csrc/ROIPool.h b/maskrcnn_benchmark/csrc/ROIPool.h new file mode 100644 index 0000000000000000000000000000000000000000..200fd7390b4629747f0ea9e16c0823ac5f099ac1 --- /dev/null +++ b/maskrcnn_benchmark/csrc/ROIPool.h @@ -0,0 +1,48 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#pragma once + +#include "cpu/vision.h" + +#ifdef WITH_CUDA +#include "cuda/vision.h" +#endif + + +std::tuple<at::Tensor, at::Tensor> ROIPool_forward(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width) { + if (input.type().is_cuda()) { +#ifdef WITH_CUDA + return ROIPool_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +at::Tensor ROIPool_backward(const at::Tensor& grad, + const at::Tensor& input, + const at::Tensor& rois, + const at::Tensor& argmax, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width) { + if (grad.type().is_cuda()) { +#ifdef WITH_CUDA + return ROIPool_backward_cuda(grad, input, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + + + diff --git a/maskrcnn_benchmark/csrc/SigmoidFocalLoss.h b/maskrcnn_benchmark/csrc/SigmoidFocalLoss.h new file mode 100644 index 0000000000000000000000000000000000000000..308861e44774dffd89b3f5ebff7cc6c5491fe3a5 --- /dev/null +++ b/maskrcnn_benchmark/csrc/SigmoidFocalLoss.h @@ -0,0 +1,41 @@ +#pragma once + +#include "cpu/vision.h" + +#ifdef WITH_CUDA +#include "cuda/vision.h" +#endif + +// Interface for Python +at::Tensor SigmoidFocalLoss_forward( + const at::Tensor& logits, + const at::Tensor& targets, + const int num_classes, + const float gamma, + const float alpha) { + if (logits.type().is_cuda()) { +#ifdef WITH_CUDA + return SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, alpha); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +at::Tensor SigmoidFocalLoss_backward( + const 
at::Tensor& logits, + const at::Tensor& targets, + const at::Tensor& d_losses, + const int num_classes, + const float gamma, + const float alpha) { + if (logits.type().is_cuda()) { +#ifdef WITH_CUDA + return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, num_classes, gamma, alpha); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} diff --git a/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp b/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d35aedf27ea581b9241d44b87dcca2e901b5064e --- /dev/null +++ b/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp @@ -0,0 +1,257 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#include "cpu/vision.h" + +// implementation taken from Caffe2 +template <typename T> +struct PreCalc { + int pos1; + int pos2; + int pos3; + int pos4; + T w1; + T w2; + T w3; + T w4; +}; + +template <typename T> +void pre_calc_for_bilinear_interpolate( + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int iy_upper, + const int ix_upper, + T roi_start_h, + T roi_start_w, + T bin_size_h, + T bin_size_w, + int roi_bin_grid_h, + int roi_bin_grid_w, + std::vector<PreCalc<T>>& pre_calc) { + int pre_calc_index = 0; + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + for (int iy = 0; iy < iy_upper; iy++) { + const T yy = roi_start_h + ph * bin_size_h + + static_cast<T>(iy + .5f) * bin_size_h / + static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < ix_upper; ix++) { + const T xx = roi_start_w + pw * bin_size_w + + static_cast<T>(ix + .5f) * bin_size_w / + static_cast<T>(roi_bin_grid_w); + + T x = xx; + T y = yy; + // deal with: inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + PreCalc<T> pc; + pc.pos1 = 0; + pc.pos2 = 0; + pc.pos3 = 0; + pc.pos4 = 0; + pc.w1 = 0; + pc.w2 = 0; + pc.w3 = 0; + pc.w4 = 0; + pre_calc[pre_calc_index] = pc; + pre_calc_index += 1; + continue; + } + + if (y <= 0) { + y = 0; + } + if (x <= 0) { + x = 0; + } + + int y_low = (int)y; + int x_low = (int)x; + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. 
- lx; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + // save weights and indeces + PreCalc pc; + pc.pos1 = y_low * width + x_low; + pc.pos2 = y_low * width + x_high; + pc.pos3 = y_high * width + x_low; + pc.pos4 = y_high * width + x_high; + pc.w1 = w1; + pc.w2 = w2; + pc.w3 = w3; + pc.w4 = w4; + pre_calc[pre_calc_index] = pc; + + pre_calc_index += 1; + } + } + } + } +} + +template +void ROIAlignForward_cpu_kernel( + const int nthreads, + const T* bottom_data, + const T& spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + const T* bottom_rois, + //int roi_cols, + T* top_data) { + //AT_ASSERT(roi_cols == 4 || roi_cols == 5); + int roi_cols = 5; + + int n_rois = nthreads / channels / pooled_width / pooled_height; + // (n, c, ph, pw) is an element in the pooled output + // can be parallelized using omp + // #pragma omp parallel for num_threads(32) + for (int n = 0; n < n_rois; n++) { + int index_n = n * channels * pooled_width * pooled_height; + + // roi could have 4 or 5 columns + const T* offset_bottom_rois = bottom_rois + n * roi_cols; + int roi_batch_ind = 0; + if (roi_cols == 5) { + roi_batch_ind = offset_bottom_rois[0]; + offset_bottom_rois++; + } + + // Do not using rounding; this implementation detail is critical + T roi_start_w = offset_bottom_rois[0] * spatial_scale; + T roi_start_h = offset_bottom_rois[1] * spatial_scale; + T roi_end_w = offset_bottom_rois[2] * spatial_scale; + T roi_end_h = offset_bottom_rois[3] * spatial_scale; + // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale); + // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale); + // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale); + // T roi_end_h = round(offset_bottom_rois[3] * spatial_scale); + + // Force malformed ROIs to be 1x1 + T roi_width = std::max(roi_end_w - roi_start_w, (T)1.); + T roi_height = std::max(roi_end_h - roi_start_h, (T)1.); + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 + + // we want to precalculate indeces and weights shared by all chanels, + // this is the key point of optimiation + std::vector> pre_calc( + roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); + pre_calc_for_bilinear_interpolate( + height, + width, + pooled_height, + pooled_width, + roi_bin_grid_h, + roi_bin_grid_w, + roi_start_h, + roi_start_w, + bin_size_h, + bin_size_w, + roi_bin_grid_h, + roi_bin_grid_w, + pre_calc); + + for (int c = 0; c < channels; c++) { + int index_n_c = index_n + c * pooled_width * pooled_height; + const T* offset_bottom_data = + bottom_data + (roi_batch_ind * channels + c) * height * width; + int pre_calc_index = 0; + + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + int index = index_n_c + ph * pooled_width + pw; + + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + PreCalc pc = pre_calc[pre_calc_index]; + output_val += pc.w1 * offset_bottom_data[pc.pos1] + + pc.w2 * offset_bottom_data[pc.pos2] + + pc.w3 * offset_bottom_data[pc.pos3] + + pc.w4 * offset_bottom_data[pc.pos4]; + + pre_calc_index += 1; + } + } + output_val /= count; + + top_data[index] = output_val; + } // for pw + } // for ph + } // for c + } // for n +} + +at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio) { + AT_ASSERTM(!input.type().is_cuda(), "input must be a CPU tensor"); + AT_ASSERTM(!rois.type().is_cuda(), "rois must be a CPU tensor"); + + auto num_rois = rois.size(0); + auto channels = input.size(1); + auto height = input.size(2); + auto width = input.size(3); + + auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options()); + auto output_size = num_rois * pooled_height * pooled_width * channels; + + if (output.numel() == 0) { + return output; + } + + AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] { + ROIAlignForward_cpu_kernel( + output_size, + input.data(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + rois.data(), + output.data()); + }); + return output; +} diff --git a/maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp b/maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1153dea04f032c67c41bd0d2a285376a72c5a595 --- /dev/null +++ b/maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp @@ -0,0 +1,75 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
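+//
+// Greedy NMS on the CPU: boxes are visited in descending score order, and any
+// later box whose overlap with an already-kept box reaches `threshold` is
+// suppressed. Overlap uses the legacy "+1" pixel convention:
+//   inter = max(0, xx2 - xx1 + 1) * max(0, yy2 - yy1 + 1)
+//   ovr   = inter / (area_i + area_j - inter)
+// The kernel returns the indices of the boxes that survive suppression.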
+#include "cpu/vision.h" + + +template +at::Tensor nms_cpu_kernel(const at::Tensor& dets, + const at::Tensor& scores, + const float threshold) { + AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor"); + AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor"); + AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores"); + + if (dets.numel() == 0) { + return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); + } + + auto x1_t = dets.select(1, 0).contiguous(); + auto y1_t = dets.select(1, 1).contiguous(); + auto x2_t = dets.select(1, 2).contiguous(); + auto y2_t = dets.select(1, 3).contiguous(); + + at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1); + + auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); + + auto ndets = dets.size(0); + at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU)); + + auto suppressed = suppressed_t.data(); + auto order = order_t.data(); + auto x1 = x1_t.data(); + auto y1 = y1_t.data(); + auto x2 = x2_t.data(); + auto y2 = y2_t.data(); + auto areas = areas_t.data(); + + for (int64_t _i = 0; _i < ndets; _i++) { + auto i = order[_i]; + if (suppressed[i] == 1) + continue; + auto ix1 = x1[i]; + auto iy1 = y1[i]; + auto ix2 = x2[i]; + auto iy2 = y2[i]; + auto iarea = areas[i]; + + for (int64_t _j = _i + 1; _j < ndets; _j++) { + auto j = order[_j]; + if (suppressed[j] == 1) + continue; + auto xx1 = std::max(ix1, x1[j]); + auto yy1 = std::max(iy1, y1[j]); + auto xx2 = std::min(ix2, x2[j]); + auto yy2 = std::min(iy2, y2[j]); + + auto w = std::max(static_cast(0), xx2 - xx1 + 1); + auto h = std::max(static_cast(0), yy2 - yy1 + 1); + auto inter = w * h; + auto ovr = inter / (iarea + areas[j] - inter); + if (ovr >= threshold) + suppressed[j] = 1; + } + } + return at::nonzero(suppressed_t == 0).squeeze(1); +} + +at::Tensor nms_cpu(const at::Tensor& dets, + const at::Tensor& scores, + const float threshold) { + at::Tensor result; + AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] { + result = nms_cpu_kernel(dets, scores, threshold); + }); + return result; +} diff --git a/maskrcnn_benchmark/csrc/cpu/vision.h b/maskrcnn_benchmark/csrc/cpu/vision.h new file mode 100644 index 0000000000000000000000000000000000000000..92611253616c16efdbed66318da9930b233ae09c --- /dev/null +++ b/maskrcnn_benchmark/csrc/cpu/vision.h @@ -0,0 +1,16 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#pragma once +#include + + +at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio); + + +at::Tensor nms_cpu(const at::Tensor& dets, + const at::Tensor& scores, + const float threshold); diff --git a/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu b/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..1142fb37597141122ee63161d0abd7beac510a74 --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu @@ -0,0 +1,346 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
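+//
+// ROIAlign forward: ROI coordinates are scaled by `spatial_scale` without
+// rounding, each output bin is sampled on a regular roi_bin_grid_h x
+// roi_bin_grid_w grid (sampling_ratio, or ceil(roi_size / pooled_size) when
+// sampling_ratio <= 0), every sample is bilinearly interpolated from the four
+// surrounding feature-map cells, and the samples are averaged. For a sample at
+// (y, x) with y_low = (int)y and x_low = (int)x:
+//   ly = y - y_low, lx = x - x_low, hy = 1 - ly, hx = 1 - lx
+//   val = hy*hx*v(y_low,x_low) + hy*lx*v(y_low,x_high)
+//       + ly*hx*v(y_high,x_low) + ly*lx*v(y_high,x_high)
+// Samples that fall outside the feature map contribute zero.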
+#include +#include + +#include +#include +#include + +// TODO make it in a common file +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ + i += blockDim.x * gridDim.x) + + +template +__device__ T bilinear_interpolate(const T* bottom_data, + const int height, const int width, + T y, T x, + const int index /* index for debug only*/) { + + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + //empty + return 0; + } + + if (y <= 0) y = 0; + if (x <= 0) x = 0; + + int y_low = (int) y; + int x_low = (int) x; + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T) y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T) x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + // do bilinear interpolation + T v1 = bottom_data[y_low * width + x_low]; + T v2 = bottom_data[y_low * width + x_high]; + T v3 = bottom_data[y_high * width + x_low]; + T v4 = bottom_data[y_high * width + x_high]; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + return val; +} + +template +__global__ void RoIAlignForward(const int nthreads, const T* bottom_data, + const T spatial_scale, const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int sampling_ratio, + const T* bottom_rois, T* top_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not using rounding; this implementation detail is critical + T roi_start_w = offset_bottom_rois[1] * spatial_scale; + T roi_start_h = offset_bottom_rois[2] * spatial_scale; + T roi_end_w = offset_bottom_rois[3] * spatial_scale; + T roi_end_h = offset_bottom_rois[4] * spatial_scale; + // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale); + // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale); + // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale); + // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale); + + // Force malformed ROIs to be 1x1 + T roi_width = max(roi_end_w - roi_start_w, (T)1.); + T roi_height = max(roi_end_h - roi_start_h, (T)1.); + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + const T* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 + + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1 + { + const T y = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix ++) + { + const T x = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); + + T val = bilinear_interpolate(offset_bottom_data, height, width, y, x, index); + output_val += val; + } + } + output_val /= count; + + top_data[index] = output_val; + } +} + + +template +__device__ void bilinear_interpolate_gradient( + const int height, const int width, + T y, T x, + T & w1, T & w2, T & w3, T & w4, + int & x_low, int & x_high, int & y_low, int & y_high, + const int index /* index for debug only*/) { + + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + //empty + w1 = w2 = w3 = w4 = 0.; + x_low = x_high = y_low = y_high = -1; + return; + } + + if (y <= 0) y = 0; + if (x <= 0) x = 0; + + y_low = (int) y; + x_low = (int) x; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T) y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T) x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + + // reference in forward + // T v1 = bottom_data[y_low * width + x_low]; + // T v2 = bottom_data[y_low * width + x_high]; + // T v3 = bottom_data[y_high * width + x_low]; + // T v4 = bottom_data[y_high * width + x_high]; + // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + return; +} + +template +__global__ void RoIAlignBackwardFeature(const int nthreads, const T* top_diff, + const int num_rois, const T spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, + const int sampling_ratio, + T* bottom_diff, + const T* bottom_rois) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not using rounding; this implementation detail is critical + T roi_start_w = offset_bottom_rois[1] * spatial_scale; + T roi_start_h = offset_bottom_rois[2] * spatial_scale; + T roi_end_w = offset_bottom_rois[3] * spatial_scale; + T roi_end_h = offset_bottom_rois[4] * spatial_scale; + // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale); + // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale); + // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale); + // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale); + + // Force malformed ROIs to be 1x1 + T roi_width = max(roi_end_w - roi_start_w, (T)1.); + T roi_height = max(roi_end_h - roi_start_h, (T)1.); + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + T* offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height * width; + + int top_offset = (n * channels + c) * pooled_height * pooled_width; + const T* offset_top_diff = top_diff + top_offset; + 
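+    // Backward mirrors the forward sampling: the gradient for this output bin
+    // is split evenly over the same sample grid (hence the division by count),
+    // and each sample's share is scattered to its four bilinear neighbours
+    // with the weights w1..w4. atomicAdd is required because overlapping ROIs
+    // (and adjacent samples) can write to the same input cell.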
const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 + + for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1 + { + const T y = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix ++) + { + const T x = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); + + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient(height, width, y, x, + w1, w2, w3, w4, + x_low, x_high, y_low, y_high, + index); + + T g1 = top_diff_this_bin * w1 / count; + T g2 = top_diff_this_bin * w2 / count; + T g3 = top_diff_this_bin * w3 / count; + T g4 = top_diff_this_bin * w4 / count; + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) + { + atomicAdd(offset_bottom_diff + y_low * width + x_low, static_cast(g1)); + atomicAdd(offset_bottom_diff + y_low * width + x_high, static_cast(g2)); + atomicAdd(offset_bottom_diff + y_high * width + x_low, static_cast(g3)); + atomicAdd(offset_bottom_diff + y_high * width + x_high, static_cast(g4)); + } // if + } // ix + } // iy + } // CUDA_1D_KERNEL_LOOP +} // RoIAlignBackward + + +at::Tensor ROIAlign_forward_cuda(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio) { + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); + + auto num_rois = rois.size(0); + auto channels = input.size(1); + auto height = input.size(2); + auto width = input.size(3); + + auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options()); + auto output_size = num_rois * pooled_height * pooled_width * channels; + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(THCCeilDiv((long)output_size, 512L), 4096L)); + dim3 block(512); + + if (output.numel() == 0) { + THCudaCheck(cudaGetLastError()); + return output; + } + + AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] { + RoIAlignForward<<>>( + output_size, + input.contiguous().data(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + rois.contiguous().data(), + output.data()); + }); + THCudaCheck(cudaGetLastError()); + return output; +} + +// TODO remove the dependency on input and use instead its sizes -> save memory +at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width, + const int sampling_ratio) { + AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor"); + AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); + + auto num_rois = rois.size(0); + auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options()); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(THCCeilDiv((long)grad.numel(), 
512L), 4096L)); + dim3 block(512); + + // handle possibly empty gradients + if (grad.numel() == 0) { + THCudaCheck(cudaGetLastError()); + return grad_input; + } + + AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIAlign_backward", [&] { + RoIAlignBackwardFeature<<>>( + grad.numel(), + grad.contiguous().data(), + num_rois, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + grad_input.data(), + rois.contiguous().data()); + }); + THCudaCheck(cudaGetLastError()); + return grad_input; +} diff --git a/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu b/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..8f072ffc2bd6de310f0d92c8c513dd9cfcc80dbc --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu @@ -0,0 +1,202 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#include +#include + +#include +#include +#include + + +// TODO make it in a common file +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ + i += blockDim.x * gridDim.x) + + +template +__global__ void RoIPoolFForward(const int nthreads, const T* bottom_data, + const T spatial_scale, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const T* bottom_rois, T* top_data, int* argmax_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + int roi_start_w = round(offset_bottom_rois[1] * spatial_scale); + int roi_start_h = round(offset_bottom_rois[2] * spatial_scale); + int roi_end_w = round(offset_bottom_rois[3] * spatial_scale); + int roi_end_h = round(offset_bottom_rois[4] * spatial_scale); + + // Force malformed ROIs to be 1x1 + int roi_width = max(roi_end_w - roi_start_w + 1, 1); + int roi_height = max(roi_end_h - roi_start_h + 1, 1); + T bin_size_h = static_cast(roi_height) + / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) + / static_cast(pooled_width); + + int hstart = static_cast(floor(static_cast(ph) + * bin_size_h)); + int wstart = static_cast(floor(static_cast(pw) + * bin_size_w)); + int hend = static_cast(ceil(static_cast(ph + 1) + * bin_size_h)); + int wend = static_cast(ceil(static_cast(pw + 1) + * bin_size_w)); + + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart + roi_start_h, 0), height); + hend = min(max(hend + roi_start_h, 0), height); + wstart = min(max(wstart + roi_start_w, 0), width); + wend = min(max(wend + roi_start_w, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + // Define an empty pooling region to be zero + T maxval = is_empty ? 
0 : -FLT_MAX; + // If nothing is pooled, argmax = -1 causes nothing to be backprop'd + int maxidx = -1; + const T* offset_bottom_data = + bottom_data + (roi_batch_ind * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int bottom_index = h * width + w; + if (offset_bottom_data[bottom_index] > maxval) { + maxval = offset_bottom_data[bottom_index]; + maxidx = bottom_index; + } + } + } + top_data[index] = maxval; + argmax_data[index] = maxidx; + } +} + +template +__global__ void RoIPoolFBackward(const int nthreads, const T* top_diff, + const int* argmax_data, const int num_rois, const T spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, T* bottom_diff, + const T* bottom_rois) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + int bottom_offset = (roi_batch_ind * channels + c) * height * width; + int top_offset = (n * channels + c) * pooled_height * pooled_width; + const T* offset_top_diff = top_diff + top_offset; + T* offset_bottom_diff = bottom_diff + bottom_offset; + const int* offset_argmax_data = argmax_data + top_offset; + + int argmax = offset_argmax_data[ph * pooled_width + pw]; + if (argmax != -1) { + atomicAdd( + offset_bottom_diff + argmax, + static_cast(offset_top_diff[ph * pooled_width + pw])); + + } + } +} + +std::tuple ROIPool_forward_cuda(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width) { + AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); + + auto num_rois = rois.size(0); + auto channels = input.size(1); + auto height = input.size(2); + auto width = input.size(3); + + auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options()); + auto output_size = num_rois * pooled_height * pooled_width * channels; + auto argmax = at::zeros({num_rois, channels, pooled_height, pooled_width}, input.options().dtype(at::kInt)); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(THCCeilDiv((long)output_size, 512L), 4096L)); + dim3 block(512); + + if (output.numel() == 0) { + THCudaCheck(cudaGetLastError()); + return std::make_tuple(output, argmax); + } + + AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIPool_forward", [&] { + RoIPoolFForward<<>>( + output_size, + input.contiguous().data(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + rois.contiguous().data(), + output.data(), + argmax.data()); + }); + THCudaCheck(cudaGetLastError()); + return std::make_tuple(output, argmax); +} + +// TODO remove the dependency on input and use instead its sizes -> save memory +at::Tensor ROIPool_backward_cuda(const at::Tensor& grad, + const at::Tensor& input, + const at::Tensor& rois, + const at::Tensor& argmax, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width) { + AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor"); + 
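+  // Max pooling has a sparse gradient: RoIPoolFBackward routes each pooled
+  // gradient to the single input element recorded in `argmax` during the
+  // forward pass, accumulating with atomicAdd since ROIs may overlap.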
AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); + // TODO add more checks + + auto num_rois = rois.size(0); + auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options()); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(THCCeilDiv((long)grad.numel(), 512L), 4096L)); + dim3 block(512); + + // handle possibly empty gradients + if (grad.numel() == 0) { + THCudaCheck(cudaGetLastError()); + return grad_input; + } + + AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIPool_backward", [&] { + RoIPoolFBackward<<>>( + grad.numel(), + grad.contiguous().data(), + argmax.data(), + num_rois, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + grad_input.data(), + rois.contiguous().data()); + }); + THCudaCheck(cudaGetLastError()); + return grad_input; +} diff --git a/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu b/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..7d40767bbb690eb8e55397bca83af636c7e0531c --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu @@ -0,0 +1,188 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This file is modified from https://github.com/pytorch/pytorch/blob/master/modules/detectron/sigmoid_focal_loss_op.cu +// Cheng-Yang Fu +// cyfu@cs.unc.edu +#include +#include + +#include +#include +#include + +#include + +// TODO make it in a common file +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ + i += blockDim.x * gridDim.x) + + +template +__global__ void SigmoidFocalLossForward(const int nthreads, + const T* logits, + const int* targets, + const int num_classes, + const float gamma, + const float alpha, + const int num, + T* losses) { + CUDA_1D_KERNEL_LOOP(i, nthreads) { + + int n = i / num_classes; + int d = i % num_classes; // current class[0~79]; + int t = targets[n]; // target class [1~80]; + + // Decide it is positive or negative case. + T c1 = (t == (d+1)); + T c2 = (t>=0 & t != (d+1)); + + T zn = (1.0 - alpha); + T zp = (alpha); + + // p = 1. / 1. + expf(-x); p = sigmoid(x) + T p = 1. / (1. + expf(-logits[i])); + + // (1-p)**gamma * log(p) where + T term1 = powf((1. - p), gamma) * logf(max(p, FLT_MIN)); + + // p**gamma * log(1-p) + T term2 = powf(p, gamma) * + (-1. * logits[i] * (logits[i] >= 0) - + logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))); + + losses[i] = 0.0; + losses[i] += -c1 * term1 * zp; + losses[i] += -c2 * term2 * zn; + + } // CUDA_1D_KERNEL_LOOP +} // SigmoidFocalLossForward + + +template +__global__ void SigmoidFocalLossBackward(const int nthreads, + const T* logits, + const int* targets, + const T* d_losses, + const int num_classes, + const float gamma, + const float alpha, + const int num, + T* d_logits) { + CUDA_1D_KERNEL_LOOP(i, nthreads) { + + int n = i / num_classes; + int d = i % num_classes; // current class[0~79]; + int t = targets[n]; // target class [1~80], 0 is background; + + // Decide it is positive or negative case. + T c1 = (t == (d+1)); + T c2 = (t>=0 & t != (d+1)); + + T zn = (1.0 - alpha); + T zp = (alpha); + // p = 1. / 1. + expf(-x); p = sigmoid(x) + T p = 1. / (1. + expf(-logits[i])); + + // (1-p)**g * (1 - p - g*p*log(p) + T term1 = powf((1. - p), gamma) * + (1. - p - (p * gamma * logf(max(p, FLT_MIN)))); + + // (p**g) * (g*(1-p)*log(1-p) - p) + T term2 = powf(p, gamma) * + ((-1. * logits[i] * (logits[i] >= 0) - + logf(1. 
+ expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))) * + (1. - p) * gamma - p); + d_logits[i] = 0.0; + d_logits[i] += -c1 * term1 * zp; + d_logits[i] += -c2 * term2 * zn; + d_logits[i] = d_logits[i] * d_losses[i]; + + } // CUDA_1D_KERNEL_LOOP +} // SigmoidFocalLossBackward + + +at::Tensor SigmoidFocalLoss_forward_cuda( + const at::Tensor& logits, + const at::Tensor& targets, + const int num_classes, + const float gamma, + const float alpha) { + AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor"); + AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor"); + AT_ASSERTM(logits.dim() == 2, "logits should be NxClass"); + + const int num_samples = logits.size(0); + + auto losses = at::empty({num_samples, logits.size(1)}, logits.options()); + auto losses_size = num_samples * logits.size(1); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(THCCeilDiv(losses_size, 512L), 4096L)); + dim3 block(512); + + if (losses.numel() == 0) { + THCudaCheck(cudaGetLastError()); + return losses; + } + + AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_forward", [&] { + SigmoidFocalLossForward<<>>( + losses_size, + logits.contiguous().data(), + targets.contiguous().data(), + num_classes, + gamma, + alpha, + num_samples, + losses.data()); + }); + THCudaCheck(cudaGetLastError()); + return losses; +} + + +at::Tensor SigmoidFocalLoss_backward_cuda( + const at::Tensor& logits, + const at::Tensor& targets, + const at::Tensor& d_losses, + const int num_classes, + const float gamma, + const float alpha) { + AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor"); + AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor"); + AT_ASSERTM(d_losses.type().is_cuda(), "d_losses must be a CUDA tensor"); + + AT_ASSERTM(logits.dim() == 2, "logits should be NxClass"); + + const int num_samples = logits.size(0); + AT_ASSERTM(logits.size(1) == num_classes, "logits.size(1) should be num_classes"); + + auto d_logits = at::zeros({num_samples, num_classes}, logits.options()); + auto d_logits_size = num_samples * logits.size(1); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(THCCeilDiv(d_logits_size, 512L), 4096L)); + dim3 block(512); + + if (d_logits.numel() == 0) { + THCudaCheck(cudaGetLastError()); + return d_logits; + } + + AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_backward", [&] { + SigmoidFocalLossBackward<<>>( + d_logits_size, + logits.contiguous().data(), + targets.contiguous().data(), + d_losses.contiguous().data(), + num_classes, + gamma, + alpha, + num_samples, + d_logits.data()); + }); + + THCudaCheck(cudaGetLastError()); + return d_logits; +} + diff --git a/maskrcnn_benchmark/csrc/cuda/nms.cu b/maskrcnn_benchmark/csrc/cuda/nms.cu new file mode 100644 index 0000000000000000000000000000000000000000..833d8523a5809d99a1078a144a384c864a9d8df9 --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/nms.cu @@ -0,0 +1,131 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
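+//
+// GPU NMS with a block-wise bit mask. Boxes are sorted by score before the
+// kernel runs; the kernel tiles the N x N comparison matrix into 64 x 64
+// blocks (threadsPerBlock = 8 * sizeof(unsigned long long)), and each thread
+// writes one 64-bit word marking which boxes of its column block it
+// suppresses. The host then walks the boxes in score order, keeps a box only
+// if no previously kept box has set its bit, and maps the survivors back to
+// indices into the original, unsorted box tensor.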
+#include +#include + +#include +#include + +#include +#include + +int const threadsPerBlock = sizeof(unsigned long long) * 8; + +__device__ inline float devIoU(float const * const a, float const * const b) { + float left = max(a[0], b[0]), right = min(a[2], b[2]); + float top = max(a[1], b[1]), bottom = min(a[3], b[3]); + float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); + float interS = width * height; + float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); + float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); + return interS / (Sa + Sb - interS); +} + +__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, + const float *dev_boxes, unsigned long long *dev_mask) { + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = + min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + + __shared__ float block_boxes[threadsPerBlock * 5]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 5 + 0] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; + block_boxes[threadIdx.x * 5 + 1] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; + block_boxes[threadIdx.x * 5 + 2] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; + block_boxes[threadIdx.x * 5 + 3] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; + block_boxes[threadIdx.x * 5 + 4] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; + const float *cur_box = dev_boxes + cur_box_idx * 5; + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { + t |= 1ULL << i; + } + } + const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock); + dev_mask[cur_box_idx * col_blocks + col_start] = t; + } +} + +// boxes is a N x 5 tensor +at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) { + using scalar_t = float; + AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor"); + auto scores = boxes.select(1, 4); + auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); + auto boxes_sorted = boxes.index_select(0, order_t); + + int boxes_num = boxes.size(0); + + const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock); + + scalar_t* boxes_dev = boxes_sorted.data(); + + THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState + + unsigned long long* mask_dev = NULL; + //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev, + // boxes_num * col_blocks * sizeof(unsigned long long))); + + mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long)); + + dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock), + THCCeilDiv(boxes_num, threadsPerBlock)); + dim3 threads(threadsPerBlock); + nms_kernel<<>>(boxes_num, + nms_overlap_thresh, + boxes_dev, + mask_dev); + + std::vector mask_host(boxes_num * col_blocks); + THCudaCheck(cudaMemcpy(&mask_host[0], + mask_dev, + sizeof(unsigned long long) * boxes_num * col_blocks, + cudaMemcpyDeviceToHost)); + + std::vector remv(col_blocks); + memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); + + at::Tensor keep = 
at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU)); + int64_t* keep_out = keep.data(); + + int num_to_keep = 0; + for (int i = 0; i < boxes_num; i++) { + int nblock = i / threadsPerBlock; + int inblock = i % threadsPerBlock; + + if (!(remv[nblock] & (1ULL << inblock))) { + keep_out[num_to_keep++] = i; + unsigned long long *p = &mask_host[0] + i * col_blocks; + for (int j = nblock; j < col_blocks; j++) { + remv[j] |= p[j]; + } + } + } + + THCudaFree(state, mask_dev); + // TODO improve this part + return std::get<0>(order_t.index({ + keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to( + order_t.device(), keep.scalar_type()) + }).sort(0, false)); +} diff --git a/maskrcnn_benchmark/csrc/cuda/vision.h b/maskrcnn_benchmark/csrc/cuda/vision.h new file mode 100644 index 0000000000000000000000000000000000000000..6d9f8871f7dff884da9b56200c77d84541c2535e --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/vision.h @@ -0,0 +1,63 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#pragma once +#include + + +at::Tensor SigmoidFocalLoss_forward_cuda( + const at::Tensor& logits, + const at::Tensor& targets, + const int num_classes, + const float gamma, + const float alpha); + +at::Tensor SigmoidFocalLoss_backward_cuda( + const at::Tensor& logits, + const at::Tensor& targets, + const at::Tensor& d_losses, + const int num_classes, + const float gamma, + const float alpha); + +at::Tensor ROIAlign_forward_cuda(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio); + +at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width, + const int sampling_ratio); + + +std::tuple ROIPool_forward_cuda(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width); + +at::Tensor ROIPool_backward_cuda(const at::Tensor& grad, + const at::Tensor& input, + const at::Tensor& rois, + const at::Tensor& argmax, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width); + +at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh); + + +at::Tensor compute_flow_cuda(const at::Tensor& boxes, + const int height, + const int width); diff --git a/maskrcnn_benchmark/csrc/nms.h b/maskrcnn_benchmark/csrc/nms.h new file mode 100644 index 0000000000000000000000000000000000000000..312fed4a7cb7c1bc6c2345b5e5d678cc6c1a7141 --- /dev/null +++ b/maskrcnn_benchmark/csrc/nms.h @@ -0,0 +1,28 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
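+//
+// Dispatcher exposed to Python (bound as "nms" in vision.cpp): for CUDA
+// tensors the scores are concatenated as a fifth column and passed to
+// nms_cuda, which expects an N x 5 tensor; for CPU tensors nms_cpu is called
+// with dets and scores kept separate. Builds without WITH_CUDA raise an error
+// on GPU inputs.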
+#pragma once +#include "cpu/vision.h" + +#ifdef WITH_CUDA +#include "cuda/vision.h" +#endif + + +at::Tensor nms(const at::Tensor& dets, + const at::Tensor& scores, + const float threshold) { + + if (dets.type().is_cuda()) { +#ifdef WITH_CUDA + // TODO raise error if not compiled with CUDA + if (dets.numel() == 0) + return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); + auto b = at::cat({dets, scores.unsqueeze(1)}, 1); + return nms_cuda(b, threshold); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + + at::Tensor result = nms_cpu(dets, scores, threshold); + return result; +} diff --git a/maskrcnn_benchmark/csrc/vision.cpp b/maskrcnn_benchmark/csrc/vision.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8234f43b1de8fd43fc9ec18c10909fa723652df1 --- /dev/null +++ b/maskrcnn_benchmark/csrc/vision.cpp @@ -0,0 +1,15 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#include "nms.h" +#include "ROIAlign.h" +#include "ROIPool.h" +#include "SigmoidFocalLoss.h" + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("nms", &nms, "non-maximum suppression"); + m.def("roi_align_forward", &ROIAlign_forward, "ROIAlign_forward"); + m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward"); + m.def("roi_pool_forward", &ROIPool_forward, "ROIPool_forward"); + m.def("roi_pool_backward", &ROIPool_backward, "ROIPool_backward"); + m.def("sigmoid_focalloss_forward", &SigmoidFocalLoss_forward, "SigmoidFocalLoss_forward"); + m.def("sigmoid_focalloss_backward", &SigmoidFocalLoss_backward, "SigmoidFocalLoss_backward"); +} diff --git a/maskrcnn_benchmark/data/README.md b/maskrcnn_benchmark/data/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8ae85e0567cbe71ef1f1df4137cbf549240065d2 --- /dev/null +++ b/maskrcnn_benchmark/data/README.md @@ -0,0 +1,90 @@ +# Setting Up Datasets +This file describes how to perform training on other datasets. + +Only Pascal VOC dataset can be loaded from its original format and be outputted to Pascal style results currently. + +We expect the annotations from other datasets be converted to COCO json format, and +the output will be in COCO-style. (i.e. AP, AP50, AP75, APs, APm, APl for bbox and segm) + +## Creating Symlinks for PASCAL VOC + +We assume that your symlinked `datasets/voc/VOC` directory has the following structure: + +``` +VOC +|_ JPEGImages +| |_ .jpg +| |_ ... +| |_ .jpg +|_ Annotations +| |_ pascal_train.json (optional) +| |_ pascal_val.json (optional) +| |_ pascal_test.json (optional) +| |_ .xml +| |_ ... +| |_ .xml +|_ VOCdevkit +``` + +Create symlinks for `voc/VOC`: + +``` +cd ~/github/maskrcnn-benchmark +mkdir -p datasets/voc/VOC +ln -s /path/to/VOC /datasets/voc/VOC +``` +Example configuration files for PASCAL VOC could be found [here](https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/configs/pascal_voc/). + +### PASCAL VOC Annotations in COCO Format +To output COCO-style evaluation result, PASCAL VOC annotations in COCO json format is required and could be downloaded from [here](https://storage.googleapis.com/coco-dataset/external/PASCAL_VOC.zip) +via http://cocodataset.org/#external. + +## Creating Symlinks for Cityscapes: + +We assume that your symlinked `datasets/cityscapes` directory has the following structure: + +``` +cityscapes +|_ images +| |_ .jpg +| |_ ... +| |_ .jpg +|_ annotations +| |_ instanceonly_gtFile_train.json +| |_ ... +|_ raw + |_ gtFine + |_ ... 
+ |_ README.md +``` + +Create symlinks for `cityscapes`: + +``` +cd ~/github/maskrcnn-benchmark +mkdir -p datasets/cityscapes +ln -s /path/to/cityscapes datasets/data/cityscapes +``` + +### Steps to convert Cityscapes Annotations to COCO Format +1. Download gtFine_trainvaltest.zip from https://www.cityscapes-dataset.com/downloads/ (login required) +2. Extract it to /path/to/gtFine_trainvaltest +``` +cityscapes +|_ gtFine_trainvaltest.zip +|_ gtFine_trainvaltest + |_ gtFine +``` +3. Run the below commands to convert the annotations + +``` +cd ~/github +git clone https://github.com/mcordts/cityscapesScripts.git +cd cityscapesScripts +cp ~/github/maskrcnn-benchmark/tools/cityscapes/instances2dict_with_polygons.py cityscapesscripts/evaluation +python setup.py install +cd ~/github/maskrcnn-benchmark +python tools/cityscapes/convert_cityscapes_to_coco.py --datadir /path/to/cityscapes --outdir /path/to/cityscapes/annotations +``` + +Example configuration files for Cityscapes could be found [here](https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/configs/cityscapes/). diff --git a/maskrcnn_benchmark/data/__init__.py b/maskrcnn_benchmark/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2ba1e52473f97615cc41f82aef279fff4d194527 --- /dev/null +++ b/maskrcnn_benchmark/data/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from .build import make_data_loader diff --git a/maskrcnn_benchmark/data/build.py b/maskrcnn_benchmark/data/build.py new file mode 100644 index 0000000000000000000000000000000000000000..330b71a78a332d9d10e1e0e25ce36a67649d5d51 --- /dev/null +++ b/maskrcnn_benchmark/data/build.py @@ -0,0 +1,195 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import bisect +import copy +import logging + +import torch.utils.data +from maskrcnn_benchmark.utils.comm import get_world_size +from maskrcnn_benchmark.utils.imports import import_file + +from . import datasets as D +from . import samplers + +from .collate_batch import BatchCollator +from .transforms import build_transforms + + +def build_dataset(cfg, dataset_list, transforms, dataset_catalog, is_train=True): + """ + Arguments: + cfg: config + dataset_list (list[str]): Contains the names of the datasets, i.e., + coco_2014_trian, coco_2014_val, etc + transforms (callable): transforms to apply to each (image, target) sample + dataset_catalog (DatasetCatalog): contains the information on how to + construct a dataset. 
+ is_train (bool): whether to setup the dataset for training or testing + """ + if not isinstance(dataset_list, (list, tuple)): + raise RuntimeError( + "dataset_list should be a list of strings, got {}".format(dataset_list) + ) + datasets = [] + for dataset_name in dataset_list: + data = dataset_catalog.get(dataset_name) + factory = getattr(D, data["factory"]) + args = data["args"] + # for COCODataset, we want to remove images without annotations + # during training + # if data["factory"] == "COCODataset": + if "COCO" in data["factory"]: + args["remove_images_without_annotations"] = is_train + if data["factory"] == "PascalVOCDataset": + args["use_difficult"] = not is_train + args["transforms"] = transforms + # make dataset from factory + dataset = factory(**args) + if hasattr(dataset, 'density_categories'): + dataset.density_categories = cfg.MODEL.DENSITY_HEAD.NUM_CLASSES + if hasattr(dataset, 'density_map_stride'): + dataset.density_map_stride = cfg.MODEL.DENSITY_HEAD.FPN_LEVEL_STRIDE + if hasattr(dataset, 'density_min_sigma'): + min_sigmas = { + 1: 1.0, + 2: 0.5, + 3: 0.333, + } + min_sigma = min_sigmas[cfg.MODEL.DENSITY_HEAD.FPN_LEVEL] + dataset.density_min_sigma = min_sigma + print('using density_min_sigma: {}'.format(min_sigma)) + datasets.append(dataset) + + # for testing, return a list of datasets + if not is_train: + return datasets + + # for training, concatenate all datasets into a single one + dataset = datasets[0] + if len(datasets) > 1: + dataset = D.ConcatDataset(datasets) + + return [dataset] + + +def make_data_sampler(dataset, shuffle, distributed): + if distributed: + return samplers.DistributedSampler(dataset, shuffle=shuffle) + if shuffle: + sampler = torch.utils.data.sampler.RandomSampler(dataset) + else: + sampler = torch.utils.data.sampler.SequentialSampler(dataset) + return sampler + + +def _quantize(x, bins): + bins = copy.copy(bins) + bins = sorted(bins) + quantized = list(map(lambda y: bisect.bisect_right(bins, y), x)) + return quantized + + +def _compute_aspect_ratios(dataset): + aspect_ratios = [] + for i in range(len(dataset)): + img_info = dataset.get_img_info(i) + aspect_ratio = float(img_info["height"]) / float(img_info["width"]) + aspect_ratios.append(aspect_ratio) + return aspect_ratios + + +def make_batch_data_sampler( + dataset, sampler, aspect_grouping, images_per_batch, num_iters=None, start_iter=0 +): + if aspect_grouping: + if not isinstance(aspect_grouping, (list, tuple)): + aspect_grouping = [aspect_grouping] + aspect_ratios = _compute_aspect_ratios(dataset) + group_ids = _quantize(aspect_ratios, aspect_grouping) + batch_sampler = samplers.GroupedBatchSampler( + sampler, group_ids, images_per_batch, drop_uneven=False + ) + else: + batch_sampler = torch.utils.data.sampler.BatchSampler( + sampler, images_per_batch, drop_last=False + ) + if num_iters is not None: + batch_sampler = samplers.IterationBasedBatchSampler( + batch_sampler, num_iters, start_iter + ) + return batch_sampler + + +def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0, datasets=None, num_iters=None): + num_gpus = get_world_size() + if is_train: + images_per_batch = cfg.SOLVER.IMS_PER_BATCH + assert ( + images_per_batch % num_gpus == 0 + ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number " + "of GPUs ({}) used.".format(images_per_batch, num_gpus) + images_per_gpu = images_per_batch // num_gpus + shuffle = True + _num_iters = cfg.SOLVER.MAX_ITER + else: + images_per_batch = cfg.TEST.IMS_PER_BATCH + assert ( + images_per_batch % num_gpus == 0 + ), 
"TEST.IMS_PER_BATCH ({}) must be divisible by the number " + "of GPUs ({}) used.".format(images_per_batch, num_gpus) + images_per_gpu = images_per_batch // num_gpus + shuffle = False if not is_distributed else True + shuffle = True + _num_iters = None + start_iter = 0 + + if images_per_gpu > 1: + logger = logging.getLogger(__name__) + logger.warning( + "When using more than one image per GPU you may encounter " + "an out-of-memory (OOM) error if your GPU does not have " + "sufficient memory. If this happens, you can reduce " + "SOLVER.IMS_PER_BATCH (for training) or " + "TEST.IMS_PER_BATCH (for inference). For training, you must " + "also adjust the learning rate and schedule length according " + "to the linear scaling rule. See for example: " + "https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14" + ) + + # group images which have similar aspect ratio. In this case, we only + # group in two cases: those with width / height > 1, and the other way around, + # but the code supports more general grouping strategy + aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else [] + + paths_catalog = import_file( + "maskrcnn_benchmark.config.paths_catalog", cfg.PATHS_CATALOG, True + ) + DatasetCatalog = paths_catalog.DatasetCatalog + transforms = build_transforms(cfg, is_train) + + if not datasets: + dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST + datasets = build_dataset(cfg, dataset_list, transforms, DatasetCatalog, is_train) + + data_loaders = [] + for dataset in datasets: + sampler = make_data_sampler(dataset, shuffle, is_distributed) + if num_iters is not None: + _num_iters = num_iters + + batch_sampler = make_batch_data_sampler( + dataset, sampler, aspect_grouping, images_per_gpu, _num_iters, start_iter + ) + collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY) + num_workers = cfg.DATALOADER.NUM_WORKERS + data_loader = torch.utils.data.DataLoader( + dataset, + num_workers=num_workers, + batch_sampler=batch_sampler, + collate_fn=collator, + ) + data_loaders.append(data_loader) + if is_train: + # during training, a single (possibly concatenated) data_loader is returned + assert len(data_loaders) == 1 + return data_loaders[0] + return data_loaders diff --git a/maskrcnn_benchmark/data/collate_batch.py b/maskrcnn_benchmark/data/collate_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..a7f03416741cfb4c04de613f7d2c8f2050258d73 --- /dev/null +++ b/maskrcnn_benchmark/data/collate_batch.py @@ -0,0 +1,20 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from maskrcnn_benchmark.structures.image_list import to_image_list + + +class BatchCollator(object): + """ + From a list of samples from the dataset, + returns the batched images and targets. + This should be passed to the DataLoader + """ + + def __init__(self, size_divisible=0): + self.size_divisible = size_divisible + + def __call__(self, batch): + transposed_batch = list(zip(*batch)) + images = to_image_list(transposed_batch[0], self.size_divisible) + targets = transposed_batch[1] + img_ids = transposed_batch[2] + return images, targets, img_ids diff --git a/maskrcnn_benchmark/data/datasets/__init__.py b/maskrcnn_benchmark/data/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3e40c6347f8e45167ca4d7920a1f724e525edf37 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Facebook, Inc. 
and its affiliates. All Rights Reserved. +from .coco import COCODataset +from .voc import PascalVOCDataset +from .concat_dataset import ConcatDataset +from .rpc import RPCDataset, RPCTestDataset, RPCPseudoDataset, RPCInstanceSelectDataset, ImagesDataset +from .coco_density import COCODensityDataset, CocoUnlabelDataset + +__all__ = ["COCODataset", "ConcatDataset", "PascalVOCDataset", "RPCDataset", "COCODensityDataset", "CocoUnlabelDataset", + "RPCTestDataset", 'RPCPseudoDataset', 'RPCInstanceSelectDataset', 'ImagesDataset'] diff --git a/maskrcnn_benchmark/data/datasets/coco.py b/maskrcnn_benchmark/data/datasets/coco.py new file mode 100644 index 0000000000000000000000000000000000000000..3804b3919b54a81080283020af20c14572ebde4a --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/coco.py @@ -0,0 +1,101 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +import torchvision + +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.structures.segmentation_mask import SegmentationMask +from maskrcnn_benchmark.structures.keypoint import PersonKeypoints + + +min_keypoints_per_image = 10 + + +def _count_visible_keypoints(anno): + return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno) + + +def _has_only_empty_bbox(anno): + return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno) + + +def has_valid_annotation(anno): + # if it's empty, there is no annotation + if len(anno) == 0: + return False + # if all boxes have close to zero area, there is no annotation + if _has_only_empty_bbox(anno): + return False + # keypoints task have a slight different critera for considering + # if an annotation is valid + if "keypoints" not in anno[0]: + return True + # for keypoint detection tasks, only consider valid images those + # containing at least min_keypoints_per_image + if _count_visible_keypoints(anno) >= min_keypoints_per_image: + return True + return False + + +class COCODataset(torchvision.datasets.coco.CocoDetection): + def __init__( + self, ann_file, root, remove_images_without_annotations, transforms=None + ): + super(COCODataset, self).__init__(root, ann_file) + # sort indices for reproducible results + self.ids = sorted(self.ids) + + # filter images without detection annotations + if remove_images_without_annotations: + ids = [] + for img_id in self.ids: + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=None) + anno = self.coco.loadAnns(ann_ids) + if has_valid_annotation(anno): + ids.append(img_id) + self.ids = ids + + self.json_category_id_to_contiguous_id = { + v: i + 1 for i, v in enumerate(self.coco.getCatIds()) + } + self.contiguous_category_id_to_json_id = { + v: k for k, v in self.json_category_id_to_contiguous_id.items() + } + self.id_to_img_map = {k: v for k, v in enumerate(self.ids)} + self._transforms = transforms + + def __getitem__(self, idx): + img, anno = super(COCODataset, self).__getitem__(idx) + + # filter crowd annotations + # TODO might be better to add an extra field + anno = [obj for obj in anno if obj["iscrowd"] == 0] + + boxes = [obj["bbox"] for obj in anno] + boxes = torch.as_tensor(boxes).reshape(-1, 4) # guard against no boxes + target = BoxList(boxes, img.size, mode="xywh").convert("xyxy") + + classes = [obj["category_id"] for obj in anno] + classes = [self.json_category_id_to_contiguous_id[c] for c in classes] + classes = torch.tensor(classes) + target.add_field("labels", classes) + + masks = [obj["segmentation"] for obj in anno] + masks = SegmentationMask(masks, 
img.size) + target.add_field("masks", masks) + + if anno and "keypoints" in anno[0]: + keypoints = [obj["keypoints"] for obj in anno] + keypoints = PersonKeypoints(keypoints, img.size) + target.add_field("keypoints", keypoints) + + target = target.clip_to_image(remove_empty=True) + + if self._transforms is not None: + img, target = self._transforms(img, target) + + return img, target, idx + + def get_img_info(self, index): + img_id = self.id_to_img_map[index] + img_data = self.coco.imgs[img_id] + return img_data diff --git a/maskrcnn_benchmark/data/datasets/coco_density.py b/maskrcnn_benchmark/data/datasets/coco_density.py new file mode 100644 index 0000000000000000000000000000000000000000..c6d754f46a79f0c7076dacbb007b52ce968d68ca --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/coco_density.py @@ -0,0 +1,278 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import json +import os +import random + +import numpy as np +import torch +import torchvision +from PIL import Image +from scipy import ndimage +from torch.utils.data import Dataset + +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.structures.segmentation_mask import Heatmap +from maskrcnn_benchmark.utils.density import contiguous_coco_category_to_super_category + +min_keypoints_per_image = 10 + + +def _count_visible_keypoints(anno): + return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno) + + +def _has_only_empty_bbox(anno): + return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno) + + +def has_valid_annotation(anno): + # if it's empty, there is no annotation + if len(anno) == 0: + return False + # if all boxes have close to zero area, there is no annotation + if _has_only_empty_bbox(anno): + return False + # keypoints task have a slight different critera for considering + # if an annotation is valid + if "keypoints" not in anno[0]: + return True + # for keypoint detection tasks, only consider valid images those + # containing at least min_keypoints_per_image + if _count_visible_keypoints(anno) >= min_keypoints_per_image: + return True + return False + + +def generate_density_map(labels, boxes, scale, size, num_classes=1, min_sigma=1.): + height, width = size + scale_h, scale_w = scale + density_map = np.zeros((num_classes, height, width), dtype=np.float32) + for category, box in zip(labels, boxes): + x1, y1, x2, y2 = box + x1 *= scale_w + x2 *= scale_w + y1 *= scale_h + y2 *= scale_h + w, h = x2 - x1, y2 - y1 + box_radius = min(w, h) / 2 + sigma = max(min_sigma, box_radius * 5 / (4 * 3)) # 3/5 of gaussian kernel is in box + cx, cy = round((x1 + x2) / 2), round((y1 + y2) / 2) + density = np.zeros((height, width), dtype=np.float32) + density[min(cy, height - 1), min(cx, width - 1)] = 1 + density = ndimage.filters.gaussian_filter(density, sigma, mode='constant') + density_map[category, :, :] += density + + return density_map + + +class Resize(object): + def __init__(self, min_size=800, max_size=1333): + if not isinstance(min_size, (list, tuple)): + min_size = (min_size,) + self.min_size = min_size + self.max_size = max_size + + # modified from torchvision to add support for max size + def get_size(self, image_size): + w, h = image_size + size = random.choice(self.min_size) + max_size = self.max_size + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) 
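+        # The shorter image side is scaled to the chosen min_size, shrinking
+        # the target first if that would push the longer side past max_size.
+        # With the defaults (800, 1333) a 640x480 image comes out at roughly
+        # 1066x800. COCODensityDataset multiplies this size by
+        # density_map_stride so the density map lines up with the resized input.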
+ + if (w <= h and w == size) or (h <= w and h == size): + return (h, w) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + + return oh, ow + + def __call__(self, size): + size = self.get_size(size) + return size + + +class COCODensityDataset(torchvision.datasets.coco.CocoDetection): + def __init__( + self, ann_file, root, remove_images_without_annotations, use_density_map=True, transforms=None + ): + super(COCODensityDataset, self).__init__(root, ann_file) + # sort indices for reproducible results + self.use_density_map = use_density_map + self.density_map_size = 100 + self.ids = sorted(self.ids) + self.density_categories = 1 + self.density_map_stride = 1.0 / 8 + self.density_min_sigma = 1.0 + + # filter images without detection annotations + if remove_images_without_annotations: + ids = [] + for img_id in self.ids: + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=None) + anno = self.coco.loadAnns(ann_ids) + if has_valid_annotation(anno): + ids.append(img_id) + self.ids = ids + + self.json_category_id_to_contiguous_id = { + v: i + 1 for i, v in enumerate(self.coco.getCatIds()) + } + self.contiguous_category_id_to_json_id = { + v: k for k, v in self.json_category_id_to_contiguous_id.items() + } + self.id_to_img_map = {k: v for k, v in enumerate(self.ids)} + self._transforms = transforms + + def get_annotation(self, image_id): + coco = self.coco + ann_ids = coco.getAnnIds(imgIds=image_id) + img_data = self.coco.imgs[image_id] + anno = coco.loadAnns(ann_ids) + boxes = [obj["bbox"] for obj in anno] + boxes = torch.as_tensor(boxes).reshape(-1, 4) # guard against no boxes + target = BoxList(boxes, (img_data['width'], img_data['height']), mode="xywh").convert("xyxy") + + labels = [obj["category_id"] for obj in anno] + labels = [self.json_category_id_to_contiguous_id[c] for c in labels] + target.add_field("labels", torch.tensor(labels)) + + target = target.clip_to_image(remove_empty=True) + + return {'boxes': target.bbox.tolist(), 'labels': target.get_field('labels').tolist()} + + def __getitem__(self, idx): + img, anno = super(COCODensityDataset, self).__getitem__(idx) + width, height = img.size[0], img.size[1] + # filter crowd annotations + # TODO might be better to add an extra field + anno = [obj for obj in anno if obj["iscrowd"] == 0] + + boxes = [obj["bbox"] for obj in anno] + boxes = torch.as_tensor(boxes).reshape(-1, 4) # guard against no boxes + target = BoxList(boxes, img.size, mode="xywh").convert("xyxy") + + classes = [obj["category_id"] for obj in anno] + labels = classes = [self.json_category_id_to_contiguous_id[c] for c in classes] + classes = torch.tensor(classes) + target.add_field("labels", classes) + + target = target.clip_to_image(remove_empty=True) + + if self.use_density_map: + resize = Resize() + input_height, input_width = resize((width, height)) + stride = self.density_map_stride + output_height, output_width = round(input_height * stride), round(input_width * stride) + size = (output_height, output_width) + scale = (output_height / height, output_width / width) + super_categories = [contiguous_coco_category_to_super_category(category, self.density_categories) for category in labels] + density_map = generate_density_map(super_categories, target.bbox.tolist(), scale=scale, size=size, + num_classes=self.density_categories, min_sigma=self.density_min_sigma) + target.add_field('heatmap', Heatmap(torch.from_numpy(density_map))) + + if self._transforms is not None: + img, target = self._transforms(img, target) + + return img, 
target, idx + + def get_img_info(self, index): + img_id = self.id_to_img_map[index] + img_data = self.coco.imgs[img_id] + return img_data + + +class CocoUnlabelDataset(Dataset): + def __init__(self, img_dir, ann_file, pseudo_labels_file=None, use_density_map=True, transforms=None): + """ + Args: + img_dir: + ann_file: dict_keys(['info', 'images', 'licenses']) + {'license': 2, 'file_name': '000000533083.jpg', 'coco_url': 'http://images.cocodataset.org/unlabeled2017/000000533083.jpg', 'height': 640, 'width': 426, 'date_captured': '2013-11-14 10:56:14', + 'flickr_url': 'http://farm3.staticflickr.com/2567/4077404434_1bdea2d393_z.jpg', 'id': 533083} + """ + from pycocotools.coco import COCO + self.img_dir = img_dir + self.use_density_map = use_density_map + self.transforms = transforms + self.coco = COCO('/data7/lufficc/coco/annotations/instances_minival2014.json') + self.json_category_id_to_contiguous_id = { + v: i + 1 for i, v in enumerate(self.coco.getCatIds()) + } + self.contiguous_category_id_to_json_id = { + v: k for k, v in self.json_category_id_to_contiguous_id.items() + } + with open(ann_file) as f: + annotations = json.load(f) + self.images = annotations['images'] + + self.images_dict = {} + for i in self.images: + self.images_dict[i['id']] = i + + if pseudo_labels_file is None: + self.annotations = None + else: + self.images = [] + with open(pseudo_labels_file) as fid: + annotations = {} + anns = json.load(fid) + for ann in anns: + if len(ann['bbox']) > 0: + img_id = ann['id'] + img_info = self.images_dict[img_id] + self.images.append(img_info) + annotations[img_id] = ann + self.annotations = annotations + print('Pseudo labels: ', len(self.annotations)) + + def __len__(self): + return len(self.images) + + def __getitem__(self, index): + img_info = self.images[index] + img_path = os.path.join(self.img_dir, img_info['file_name']) + img = Image.open(img_path).convert("RGB") + width, height = img.size[0], img.size[1] + if self.annotations is None: + target = BoxList(torch.empty((0, 4), dtype=torch.float32), (width, height), mode="xyxy") + else: + boxes = [] + labels = [] + ann = self.annotations[img_info['id']] + for category, x, y, w, h in ann['bbox']: + boxes.append([x, y, x + w, y + h]) + labels.append(category) + + target = BoxList(torch.tensor(boxes, dtype=torch.float32).reshape((-1, 4)), (width, height), mode="xyxy") + target.add_field('labels', torch.tensor(labels)) + target = target.clip_to_image(remove_empty=True) + + if self.use_density_map: + resize = Resize() + input_height, input_width = resize((width, height)) + stride = 1.0 / 8 + output_height, output_width = round(input_height * stride), round(input_width * stride) + size = (output_height, output_width) + scale = (output_height / height, output_width / width) + num_classes = 1 + super_categories = [0] * len(labels) + density_map = generate_density_map(super_categories, target.bbox.tolist(), scale=scale, size=size, num_classes=num_classes) + target.add_field('heatmap', Heatmap(torch.from_numpy(density_map))) + + if self.transforms is not None: + img, target = self.transforms(img, target) + + return img, target, index + + def get_img_info(self, index): + img_info = self.images[index] + return img_info diff --git a/maskrcnn_benchmark/data/datasets/concat_dataset.py b/maskrcnn_benchmark/data/datasets/concat_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..e5e087c42036f27132ca2c6e1d5252af5fee4a97 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/concat_dataset.py @@ -0,0 +1,23 @@ +# Copyright 
(c) Facebook, Inc. and its affiliates. All Rights Reserved. +import bisect + +from torch.utils.data.dataset import ConcatDataset as _ConcatDataset + + +class ConcatDataset(_ConcatDataset): + """ + Same as torch.utils.data.dataset.ConcatDataset, but exposes an extra + method for querying the sizes of the image + """ + + def get_idxs(self, idx): + dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) + if dataset_idx == 0: + sample_idx = idx + else: + sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] + return dataset_idx, sample_idx + + def get_img_info(self, idx): + dataset_idx, sample_idx = self.get_idxs(idx) + return self.datasets[dataset_idx].get_img_info(sample_idx) diff --git a/maskrcnn_benchmark/data/datasets/evaluation/__init__.py b/maskrcnn_benchmark/data/datasets/evaluation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..97e811dc658b9d4e794b767d09711079d10d5fdb --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/__init__.py @@ -0,0 +1,38 @@ +from maskrcnn_benchmark.data import datasets + +from .coco import coco_evaluation +from .voc import voc_evaluation +from .rpc import rpc_evaluation +from .coco_density import coco_density_evaluation + + +def evaluate(dataset, predictions, output_folder, **kwargs): + """evaluate dataset using different methods based on dataset type. + Args: + dataset: Dataset object + predictions(list[BoxList]): each item in the list represents the + prediction results for one image. + output_folder: output folder, to save evaluation files or results. + **kwargs: other args. + Returns: + evaluation result + """ + args = dict( + dataset=dataset, predictions=predictions, output_folder=output_folder, **kwargs + ) + if isinstance(dataset, datasets.COCODataset): + return coco_evaluation(**args) + elif isinstance(dataset, datasets.PascalVOCDataset): + return voc_evaluation(**args) + elif isinstance(dataset, datasets.RPCTestDataset): + return rpc_evaluation(**args) + elif isinstance(dataset, datasets.COCODensityDataset): + return coco_density_evaluation(**args) + elif isinstance(dataset, datasets.CocoUnlabelDataset): + args.update({ + 'has_annotation': False, + }) + return coco_density_evaluation(**args) + else: + dataset_name = dataset.__class__.__name__ + raise NotImplementedError("Unsupported dataset type {}.".format(dataset_name)) diff --git a/maskrcnn_benchmark/data/datasets/evaluation/coco/__init__.py b/maskrcnn_benchmark/data/datasets/evaluation/coco/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..23f8b97578fd6b096867b1c0ad60d9bc56d44e5a --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/coco/__init__.py @@ -0,0 +1,22 @@ +from .coco_eval import do_coco_evaluation + + +def coco_evaluation( + dataset, + predictions, + output_folder, + box_only, + iou_types, + expected_results, + expected_results_sigma_tol, + **_, +): + return do_coco_evaluation( + dataset=dataset, + predictions=predictions, + box_only=box_only, + output_folder=output_folder, + iou_types=iou_types, + expected_results=expected_results, + expected_results_sigma_tol=expected_results_sigma_tol, + ) diff --git a/maskrcnn_benchmark/data/datasets/evaluation/coco/coco_eval.py b/maskrcnn_benchmark/data/datasets/evaluation/coco/coco_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..ca3948dd73951cda0fe027fb9df39e8acc54fa73 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/coco/coco_eval.py @@ -0,0 +1,407 @@ +import logging +import tempfile +import os +from 
datetime import datetime + +import torch +from collections import OrderedDict +from tqdm import tqdm + +from maskrcnn_benchmark.modeling.roi_heads.mask_head.inference import Masker +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou + + +def do_coco_evaluation( + dataset, + predictions, + box_only, + output_folder, + iou_types, + expected_results, + expected_results_sigma_tol, +): + logger = logging.getLogger("maskrcnn_benchmark.inference") + + if box_only: + logger.info("Evaluating bbox proposals") + areas = {"all": "", "small": "s", "medium": "m", "large": "l"} + res = COCOResults("box_proposal") + for limit in [100, 1000]: + for area, suffix in areas.items(): + stats = evaluate_box_proposals( + predictions, dataset, area=area, limit=limit + ) + key = "AR{}@{:d}".format(suffix, limit) + res.results["box_proposal"][key] = stats["ar"].item() + logger.info(res) + check_expected_results(res, expected_results, expected_results_sigma_tol) + if output_folder: + path = os.path.join(output_folder, "box_proposals_result_{}.txt".format(datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))) + with open(path, "w") as f: + f.write(str(res)) + torch.save(res, os.path.join(output_folder, "box_proposals.pth")) + eval_result = dict(metrics=res.results) + return eval_result + logger.info("Preparing results for COCO format") + coco_results = {} + if "bbox" in iou_types: + logger.info("Preparing bbox results") + coco_results["bbox"] = prepare_for_coco_detection(predictions, dataset) + if "segm" in iou_types: + logger.info("Preparing segm results") + coco_results["segm"] = prepare_for_coco_segmentation(predictions, dataset) + if 'keypoints' in iou_types: + logger.info('Preparing keypoints results') + coco_results['keypoints'] = prepare_for_coco_keypoint(predictions, dataset) + + results = COCOResults(*iou_types) + logger.info("Evaluating predictions") + for iou_type in iou_types: + with tempfile.NamedTemporaryFile() as f: + file_path = f.name + if output_folder: + file_path = os.path.join(output_folder, iou_type + ".json") + res = evaluate_predictions_on_coco( + dataset.coco, coco_results[iou_type], file_path, iou_type + ) + results.update(res) + logger.info(results) + check_expected_results(results, expected_results, expected_results_sigma_tol) + if output_folder: + path = os.path.join(output_folder, "coco_result_{}.txt".format(datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))) + with open(path, "w") as f: + f.write(str(results)) + torch.save(results, os.path.join(output_folder, "coco_results.pth")) + + eval_result = dict(metrics=results.results) + return eval_result + + +def prepare_for_coco_detection(predictions, dataset): + # assert isinstance(dataset, COCODataset) + coco_results = [] + for image_id, prediction in enumerate(predictions): + original_id = dataset.id_to_img_map[image_id] + if len(prediction) == 0: + continue + + img_info = dataset.get_img_info(image_id) + image_width = img_info["width"] + image_height = img_info["height"] + prediction = prediction.resize((image_width, image_height)) + prediction = prediction.convert("xywh") + + boxes = prediction.bbox.tolist() + scores = prediction.get_field("scores").tolist() + labels = prediction.get_field("labels").tolist() + + mapped_labels = [dataset.contiguous_category_id_to_json_id[i] for i in labels] + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": mapped_labels[k], + "bbox": box, + "score": scores[k], + } + for k, box in enumerate(boxes) + ] + ) + return 
coco_results + + +def prepare_for_coco_segmentation(predictions, dataset): + import pycocotools.mask as mask_util + import numpy as np + + masker = Masker(threshold=0.5, padding=1) + # assert isinstance(dataset, COCODataset) + coco_results = [] + for image_id, prediction in tqdm(enumerate(predictions)): + original_id = dataset.id_to_img_map[image_id] + if len(prediction) == 0: + continue + + img_info = dataset.get_img_info(image_id) + image_width = img_info["width"] + image_height = img_info["height"] + prediction = prediction.resize((image_width, image_height)) + masks = prediction.get_field("mask") + # t = time.time() + # Masker is necessary only if masks haven't been already resized. + if list(masks.shape[-2:]) != [image_height, image_width]: + masks = masker(masks.expand(1, -1, -1, -1, -1), prediction) + masks = masks[0] + # logger.info('Time mask: {}'.format(time.time() - t)) + # prediction = prediction.convert('xywh') + + # boxes = prediction.bbox.tolist() + scores = prediction.get_field("scores").tolist() + labels = prediction.get_field("labels").tolist() + + # rles = prediction.get_field('mask') + + rles = [ + mask_util.encode(np.array(mask[0, :, :, np.newaxis], order="F"))[0] + for mask in masks + ] + for rle in rles: + rle["counts"] = rle["counts"].decode("utf-8") + + mapped_labels = [dataset.contiguous_category_id_to_json_id[i] for i in labels] + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": mapped_labels[k], + "segmentation": rle, + "score": scores[k], + } + for k, rle in enumerate(rles) + ] + ) + return coco_results + + +def prepare_for_coco_keypoint(predictions, dataset): + # assert isinstance(dataset, COCODataset) + coco_results = [] + for image_id, prediction in enumerate(predictions): + original_id = dataset.id_to_img_map[image_id] + if len(prediction.bbox) == 0: + continue + + # TODO replace with get_img_info? + image_width = dataset.coco.imgs[original_id]['width'] + image_height = dataset.coco.imgs[original_id]['height'] + prediction = prediction.resize((image_width, image_height)) + prediction = prediction.convert('xywh') + + boxes = prediction.bbox.tolist() + scores = prediction.get_field('scores').tolist() + labels = prediction.get_field('labels').tolist() + keypoints = prediction.get_field('keypoints') + keypoints = keypoints.resize((image_width, image_height)) + keypoints = keypoints.keypoints.view(keypoints.keypoints.shape[0], -1).tolist() + + mapped_labels = [dataset.contiguous_category_id_to_json_id[i] for i in labels] + + coco_results.extend([{ + 'image_id': original_id, + 'category_id': mapped_labels[k], + 'keypoints': keypoint, + 'score': scores[k]} for k, keypoint in enumerate(keypoints)]) + return coco_results + +# inspired from Detectron +def evaluate_box_proposals( + predictions, dataset, thresholds=None, area="all", limit=None +): + """Evaluate detection proposal recall metrics. This function is a much + faster alternative to the official COCO API recall evaluation code. However, + it produces slightly different results. 
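+ Added usage note (a hypothetical sketch, not part of the original docstring):
+ `predictions` is a list of BoxList with an "objectness" field and `dataset`
+ is a COCO-style dataset exposing `id_to_img_map`, `get_img_info` and a
+ `coco` handle. The returned dict holds "ar", "recalls", "thresholds",
+ "gt_overlaps" and "num_pos", so a typical call looks like:
+ stats = evaluate_box_proposals(predictions, dataset, area="all", limit=1000)
+ print("AR@1000:", stats["ar"].item())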
+ """ + # Record max overlap value for each gt box + # Return vector of overlap values + areas = { + "all": 0, + "small": 1, + "medium": 2, + "large": 3, + "96-128": 4, + "128-256": 5, + "256-512": 6, + "512-inf": 7, + } + area_ranges = [ + [0 ** 2, 1e5 ** 2], # all + [0 ** 2, 32 ** 2], # small + [32 ** 2, 96 ** 2], # medium + [96 ** 2, 1e5 ** 2], # large + [96 ** 2, 128 ** 2], # 96-128 + [128 ** 2, 256 ** 2], # 128-256 + [256 ** 2, 512 ** 2], # 256-512 + [512 ** 2, 1e5 ** 2], + ] # 512-inf + assert area in areas, "Unknown area range: {}".format(area) + area_range = area_ranges[areas[area]] + gt_overlaps = [] + num_pos = 0 + + for image_id, prediction in enumerate(predictions): + original_id = dataset.id_to_img_map[image_id] + + img_info = dataset.get_img_info(image_id) + image_width = img_info["width"] + image_height = img_info["height"] + prediction = prediction.resize((image_width, image_height)) + + # sort predictions in descending order + # TODO maybe remove this and make it explicit in the documentation + inds = prediction.get_field("objectness").sort(descending=True)[1] + prediction = prediction[inds] + + ann_ids = dataset.coco.getAnnIds(imgIds=original_id) + anno = dataset.coco.loadAnns(ann_ids) + gt_boxes = [obj["bbox"] for obj in anno if obj["iscrowd"] == 0] + gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4) # guard against no boxes + gt_boxes = BoxList(gt_boxes, (image_width, image_height), mode="xywh").convert( + "xyxy" + ) + gt_areas = torch.as_tensor([obj["area"] for obj in anno if obj["iscrowd"] == 0]) + + if len(gt_boxes) == 0: + continue + + valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1]) + gt_boxes = gt_boxes[valid_gt_inds] + + num_pos += len(gt_boxes) + + if len(gt_boxes) == 0: + continue + + if len(prediction) == 0: + continue + + if limit is not None and len(prediction) > limit: + prediction = prediction[:limit] + + overlaps = boxlist_iou(prediction, gt_boxes) + + _gt_overlaps = torch.zeros(len(gt_boxes)) + for j in range(min(len(prediction), len(gt_boxes))): + # find which proposal box maximally covers each gt box + # and get the iou amount of coverage for each gt box + max_overlaps, argmax_overlaps = overlaps.max(dim=0) + + # find which gt box is 'best' covered (i.e. 
'best' = most iou) + gt_ovr, gt_ind = max_overlaps.max(dim=0) + assert gt_ovr >= 0 + # find the proposal box that covers the best covered gt box + box_ind = argmax_overlaps[gt_ind] + # record the iou coverage of this gt box + _gt_overlaps[j] = overlaps[box_ind, gt_ind] + assert _gt_overlaps[j] == gt_ovr + # mark the proposal box and the gt box as used + overlaps[box_ind, :] = -1 + overlaps[:, gt_ind] = -1 + + # append recorded iou coverage level + gt_overlaps.append(_gt_overlaps) + gt_overlaps = torch.cat(gt_overlaps, dim=0) + gt_overlaps, _ = torch.sort(gt_overlaps) + + if thresholds is None: + step = 0.05 + thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32) + recalls = torch.zeros_like(thresholds) + # compute recall for each iou threshold + for i, t in enumerate(thresholds): + recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos) + # ar = 2 * np.trapz(recalls, thresholds) + ar = recalls.mean() + return { + "ar": ar, + "recalls": recalls, + "thresholds": thresholds, + "gt_overlaps": gt_overlaps, + "num_pos": num_pos, + } + + +def evaluate_predictions_on_coco( + coco_gt, coco_results, json_result_file, iou_type="bbox" +): + import json + + with open(json_result_file, "w") as f: + json.dump(coco_results, f) + + from pycocotools.coco import COCO + from pycocotools.cocoeval import COCOeval + + coco_dt = coco_gt.loadRes(str(json_result_file)) if coco_results else COCO() + + # coco_dt = coco_gt.loadRes(coco_results) + coco_eval = COCOeval(coco_gt, coco_dt, iou_type) + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + return coco_eval + + +class COCOResults(object): + METRICS = { + "bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl"], + "segm": ["AP", "AP50", "AP75", "APs", "APm", "APl"], + "box_proposal": [ + "AR@100", + "ARs@100", + "ARm@100", + "ARl@100", + "AR@1000", + "ARs@1000", + "ARm@1000", + "ARl@1000", + ], + "keypoints": ["AP", "AP50", "AP75", "APm", "APl"], + } + + def __init__(self, *iou_types): + allowed_types = ("box_proposal", "bbox", "segm", "keypoints") + assert all(iou_type in allowed_types for iou_type in iou_types) + results = OrderedDict() + for iou_type in iou_types: + results[iou_type] = OrderedDict( + [(metric, -1) for metric in COCOResults.METRICS[iou_type]] + ) + self.results = results + + def update(self, coco_eval): + if coco_eval is None: + return + from pycocotools.cocoeval import COCOeval + + assert isinstance(coco_eval, COCOeval) + s = coco_eval.stats + iou_type = coco_eval.params.iouType + res = self.results[iou_type] + metrics = COCOResults.METRICS[iou_type] + for idx, metric in enumerate(metrics): + res[metric] = s[idx] + + def __repr__(self): + result_str = '' + for iou_type in self.results: + result_str += (iou_type + ':\n') + metrics = self.results[iou_type] + for metric in metrics: + result_str += '{:<10}: {}\n'.format(metric, round(metrics[metric], 3)) + result_str += ('-' * 32 + '\n') + return result_str + + +def check_expected_results(results, expected_results, sigma_tol): + if not expected_results: + return + + logger = logging.getLogger("maskrcnn_benchmark.inference") + for task, metric, (mean, std) in expected_results: + actual_val = results.results[task][metric] + lo = mean - sigma_tol * std + hi = mean + sigma_tol * std + ok = (lo < actual_val) and (actual_val < hi) + msg = ( + "{} > {} sanity check (actual vs. expected): " + "{:.3f} vs. 
mean={:.4f}, std={:.4}, range=({:.4f}, {:.4f})" + ).format(task, metric, actual_val, mean, std, lo, hi) + if not ok: + msg = "FAIL: " + msg + logger.error(msg) + else: + msg = "PASS: " + msg + logger.info(msg) diff --git a/maskrcnn_benchmark/data/datasets/evaluation/coco_density/__init__.py b/maskrcnn_benchmark/data/datasets/evaluation/coco_density/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4b9be23ea216d503e6238d1d681fb85e75e98548 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/coco_density/__init__.py @@ -0,0 +1,171 @@ +import json +import logging +import os +from datetime import datetime + +import numpy as np +from pycocotools.coco import COCO +from pycocotools.cocoeval import COCOeval + +from maskrcnn_benchmark.utils.density import contiguous_coco_category_to_super_category + + +def coco_density_evaluation(dataset, predictions, output_folder, iteration=-1, generate_pseudo_labels=True, has_annotation=True, **_): + logger = logging.getLogger("maskrcnn_benchmark.inference") + use_ground_truth = False + threshold = 0.95 + coco_results = [] + annotations = [] + density_correct = 0 + box_correct = 0 + mae = 0 # MEAN ABSOLUTE ERROR + metrics = {} + num_density_classes = 1 + has_density_map = predictions[0].has_field('density') + if has_density_map: + num_density_classes = predictions[0].get_field('density').shape[-1] + logger.info('Density category: {}'.format(num_density_classes)) + for image_id, prediction in enumerate(predictions): + if len(prediction) == 0: + continue + + img_info = dataset.get_img_info(image_id) + original_id = img_info['id'] + image_width = img_info["width"] + image_height = img_info["height"] + prediction = prediction.resize((image_width, image_height)) + prediction = prediction.convert("xywh") + + boxes = prediction.bbox.tolist() + scores = prediction.get_field("scores").tolist() + labels = prediction.get_field("labels").tolist() + + mapped_labels = [dataset.contiguous_category_id_to_json_id[i] for i in labels] + + gt_super_cat_counts = np.zeros((num_density_classes,), dtype=np.int32) + gt_all_cat_counts = np.zeros((81,), dtype=np.int32) + pred_super_cat_counts = np.zeros((num_density_classes,), dtype=np.int32) + if has_density_map: + pred_super_cat_counts = prediction.get_field('density').numpy() + pred_super_cat_counts = np.round(pred_super_cat_counts).astype(np.int32) + if has_annotation: + ann = dataset.get_annotation(img_info['id']) + for category in ann['labels']: + super_category = contiguous_coco_category_to_super_category(category, num_classes=num_density_classes) + gt_all_cat_counts[category] += 1 + gt_super_cat_counts[super_category] += 1 + + is_correct = np.all(gt_super_cat_counts == pred_super_cat_counts) + if is_correct: + density_correct += 1 + else: + mae += np.sum(np.abs(gt_super_cat_counts - pred_super_cat_counts)) + + box_super_cat_counts = np.zeros((num_density_classes,), dtype=np.int32) + box_all_cat_counts = np.zeros((81,), dtype=np.int32) + if generate_pseudo_labels and has_density_map: + image_result = { + 'bbox': [], + 'width': image_width, + 'height': image_height, + 'id': img_info['id'], + 'file_name': img_info['file_name'], + } + + for i in range(len(prediction)): + score = scores[i] + if score > threshold: + box = boxes[i] + label = labels[i] + super_category = contiguous_coco_category_to_super_category(label, num_classes=num_density_classes) + box_all_cat_counts[label] += 1 + box_super_cat_counts[super_category] += 1 + x, y, width, height = box + image_result['bbox'].append( + (label, 
x, y, width, height) + ) + if use_ground_truth and has_annotation: + is_valid = np.all(box_all_cat_counts == gt_all_cat_counts) + else: + is_valid = np.all(box_super_cat_counts == pred_super_cat_counts) + if is_valid: + annotations.append(image_result) + if has_annotation: + is_box_correct = np.all(box_all_cat_counts == gt_all_cat_counts) + if is_box_correct: + box_correct += 1 + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": mapped_labels[k], + "bbox": box, + "score": scores[k], + } + for k, box in enumerate(boxes) + ] + ) + + if has_density_map: + metrics['ratio'] = density_correct / len(predictions) + metrics['mae'] = mae / len(predictions) + logger.info('Density Ratio: {:.3f}'.format(density_correct / len(predictions))) + logger.info('Density MAE : {:.3f} '.format(mae / len(predictions))) + if generate_pseudo_labels: + if len(annotations) == 0: + logger.info('No annotations are selected.') + else: + metrics['select_ratio'] = box_correct / len(annotations) + metrics['pseudo_labels'] = len(annotations) + logger.info( + 'Select Ratio: {:.3f} ({}/{}, {:.5f} Threshold)'.format(box_correct / len(annotations), + box_correct, + len(annotations), + threshold)) + + time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + if len(coco_results) == 0: + logger.info('Nothing detected.') + with open(os.path.join(output_folder, 'result_{}.txt'.format(time_stamp)), 'w') as fid: + fid.write('Nothing detected.') + return dict(metrics={}) + + if generate_pseudo_labels: + logger.info('Pseudo-Labeling: {}'.format(len(annotations))) + with open(os.path.join(output_folder, 'pseudo_labeling.json'), 'w') as fid: + json.dump(annotations, fid) + + if not has_annotation: + return dict(metrics=metrics) + + file_path = os.path.join(output_folder, "bbox.json") + + with open(file_path, "w") as f: + json.dump(coco_results, f) + + coco_gt = dataset.coco + coco_dt = coco_gt.loadRes(str(file_path)) if coco_results else COCO() + + coco_eval = COCOeval(coco_gt, coco_dt, 'bbox') + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + result_strings = [] + keys = ["AP", "AP50", "AP75", "APs", "APm", "APl"] + + for i, key in enumerate(keys): + metrics[key] = coco_eval.stats[i] + logger.info('{:<10}: {}'.format(key, round(coco_eval.stats[i], 3))) + result_strings.append('{:<10}: {}'.format(key, round(coco_eval.stats[i], 3))) + + if iteration > 0: + filename = os.path.join(output_folder, 'result_{:07d}.txt'.format(iteration)) + else: + filename = os.path.join(output_folder, 'result_{}.txt'.format(time_stamp)) + + with open(filename, "w") as f: + f.write('\n'.join(result_strings)) + + return dict(metrics=metrics) diff --git a/maskrcnn_benchmark/data/datasets/evaluation/rpc/__init__.py b/maskrcnn_benchmark/data/datasets/evaluation/rpc/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..a18430f65751a750b32362eebed51854c23395ec --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/rpc/__init__.py @@ -0,0 +1,211 @@ +import json +import logging +import os +from datetime import datetime +import numpy as np + +import boxx +import rpctool +from tqdm import tqdm + +from maskrcnn_benchmark.utils.density import rpc_category_to_super_category + +LEVELS = ('easy', 'medium', 'hard', 'averaged') +NUM_CLASSES = 200 + 1 # 1-based +THRESHOLD = 0.95 + + +def get_cAcc(result, level): + index = LEVELS.index(level) + return float(result.loc[index, 'cAcc'].strip('%')) + + +def check_best_result(output_folder, result, result_str, filename): + current_cAcc = 
get_cAcc(result, 'averaged') + best_path = os.path.join(output_folder, 'best_result.txt') + if os.path.exists(best_path): + with open(best_path) as f: + best_cAcc = float(f.readline().strip()) + if current_cAcc >= best_cAcc: + best_cAcc = current_cAcc + with open(best_path, 'w') as f: + f.write(str(best_cAcc) + '\n' + filename + '\n' + result_str) + else: + best_cAcc = current_cAcc + with open(best_path, 'w') as f: + f.write(str(current_cAcc) + '\n' + filename + '\n' + result_str) + return best_cAcc + + +def rpc_evaluation(dataset, + predictions, + output_folder, + generate_pseudo_labels=False, + iteration=-1, + threshold=THRESHOLD, + use_ground_truth=False, # use ground truth to select pseudo labels + **_): + threshold = 0.9995 if threshold >= 1 else threshold + + logger = logging.getLogger("maskrcnn_benchmark.inference") + if generate_pseudo_labels: + logger.info('Use ground truth: {}'.format(use_ground_truth)) + + pred_boxlists = [] + annotations = [] + density_correct = 0 + box_correct = 0 + mae = 0 # MEAN ABSOLUTE ERROR + has_density_map = predictions[0].has_field('density') + num_density_classes = 1 + if has_density_map: + num_density_classes = predictions[0].get_field('density').shape[-1] + logger.info('Density category: {}'.format(num_density_classes)) + + for image_id, prediction in tqdm(enumerate(predictions)): + img_info = dataset.get_img_info(image_id) + + image_width = img_info["width"] + image_height = img_info["height"] + prediction = prediction.resize((image_width, image_height)) + bboxes = prediction.bbox.tolist() + labels = prediction.get_field("labels").tolist() + scores = prediction.get_field("scores").tolist() + + # -----------------------------------------------# + # -----------------Pseudo Label------------------# + # -----------------------------------------------# + + gt_density_cat_counts = np.zeros((num_density_classes,), dtype=np.int32) + gt_all_cat_counts = np.zeros((NUM_CLASSES,), dtype=np.int32) + pred_density_cat_counts = np.zeros((num_density_classes,), dtype=np.int32) + # density = 0.0 + if has_density_map: + pred_density_cat_counts = prediction.get_field('density').numpy() + pred_density_cat_counts = np.round(pred_density_cat_counts).astype(np.int32) + + ann = dataset.get_annotation(img_info['id']) + for category, x, y, w, h in ann: + density_category = rpc_category_to_super_category(category, num_classes=num_density_classes) + gt_all_cat_counts[category] += 1 + gt_density_cat_counts[density_category] += 1 + + is_correct = np.all(gt_density_cat_counts == pred_density_cat_counts) + if is_correct: + density_correct += 1 + else: + mae += np.sum(np.abs(gt_density_cat_counts - pred_density_cat_counts)) + + box_density_cat_counts = np.zeros((num_density_classes,), dtype=np.int32) + box_all_cat_counts = np.zeros((NUM_CLASSES,), dtype=np.int32) + + if generate_pseudo_labels and has_density_map: + image_result = { + 'bbox': [], + 'width': image_width, + 'height': image_height, + 'id': img_info['id'], + 'file_name': img_info['file_name'], + } + + for i in range(len(prediction)): + score = scores[i] + if score > threshold: + box = bboxes[i] + label = labels[i] + density_category = rpc_category_to_super_category(label, num_density_classes) + box_all_cat_counts[label] += 1 + box_density_cat_counts[density_category] += 1 + x, y, width, height = box[0], box[1], box[2] - box[0], box[3] - box[1] + image_result['bbox'].append( + (label, x, y, width, height) + ) + if use_ground_truth: + is_valid = np.all(box_all_cat_counts == gt_all_cat_counts) + else: + is_valid = 
np.all(box_density_cat_counts == pred_density_cat_counts) + if is_valid: + annotations.append(image_result) + is_box_correct = np.all(box_all_cat_counts == gt_all_cat_counts) + if is_box_correct: + box_correct += 1 + + # -----------------------------------------------# + # -----------------------------------------------# + # -----------------------------------------------# + + for i in range(len(prediction)): + score = scores[i] + box = bboxes[i] + label = labels[i] + + x, y, width, height = box[0], box[1], box[2] - box[0], box[3] - box[1] + + pred_boxlists.append({ + "image_id": img_info['id'], + "category_id": int(label), + "bbox": [float(k) for k in [x, y, width, height]], + "score": float(score), + }) + + if has_density_map: + logger.info('Density Ratio: {:.3f}'.format(density_correct / len(predictions))) + logger.info('Density MAE : {:.3f} '.format(mae / len(predictions))) + if generate_pseudo_labels: + if len(annotations) == 0: + logger.info('No annotations are selected.') + else: + logger.info( + 'Select Ratio: {:.3f} ({}/{}, {:.5f} Threshold)'.format(box_correct / len(annotations), + box_correct, + len(annotations), + threshold)) + + time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + if len(pred_boxlists) == 0: + logger.info('Nothing detected.') + with open(os.path.join(output_folder, 'result_{}.txt'.format(time_stamp)), 'w') as fid: + fid.write('Nothing detected.') + return dict(metrics={}) + + if generate_pseudo_labels: + logger.info('Pseudo-Labeling: {}'.format(len(annotations))) + with open(os.path.join(output_folder, 'pseudo_labeling.json'), 'w') as fid: + json.dump(annotations, fid) + + save_path = os.path.join(output_folder, 'bbox_results.json') + with open(save_path, 'w') as fid: + json.dump(pred_boxlists, fid) + res_js = boxx.loadjson(save_path) + ann_js = boxx.loadjson(dataset.ann_file) + result = rpctool.evaluate(res_js, ann_js) + logger.info(result) + + result_str = str(result) + if iteration > 0: + filename = os.path.join(output_folder, 'result_{:07d}.txt'.format(iteration)) + else: + filename = os.path.join(output_folder, 'result_{}.txt'.format(time_stamp)) + + if has_density_map: + result_str += '\n' + 'Ratio: {:.3f}, '.format(density_correct / len(predictions)) + 'MAE: {:.3f} '.format(mae / len(predictions)) + with open(filename, 'w') as fid: + fid.write(result_str) + + best_cAcc = check_best_result(output_folder, result, result_str, filename) + logger.info('Best cAcc: {}%'.format(best_cAcc)) + metrics = { + 'cAcc': { + 'averaged': get_cAcc(result, 'averaged'), + 'hard': get_cAcc(result, 'hard'), + 'medium': get_cAcc(result, 'medium'), + 'easy': get_cAcc(result, 'easy'), + } + } + if has_density_map: + metrics.update({ + 'Ratio': density_correct / len(predictions), + 'MAE': mae / len(predictions), + }) + eval_result = dict(metrics=metrics) + return eval_result diff --git a/maskrcnn_benchmark/data/datasets/evaluation/voc/__init__.py b/maskrcnn_benchmark/data/datasets/evaluation/voc/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1dde6413aac50810e8c8de2d4c183bddc6363e00 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/voc/__init__.py @@ -0,0 +1,16 @@ +import logging + +from .voc_eval import do_voc_evaluation + + +def voc_evaluation(dataset, predictions, output_folder, box_only, **_): + logger = logging.getLogger("maskrcnn_benchmark.inference") + if box_only: + logger.warning("voc evaluation doesn't support box_only, ignored.") + logger.info("performing voc evaluation, ignored iou_types.") + return 
do_voc_evaluation( + dataset=dataset, + predictions=predictions, + output_folder=output_folder, + logger=logger, + ) diff --git a/maskrcnn_benchmark/data/datasets/evaluation/voc/voc_eval.py b/maskrcnn_benchmark/data/datasets/evaluation/voc/voc_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..334317483ab413f7347238bd6ae4d9704f083405 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/evaluation/voc/voc_eval.py @@ -0,0 +1,221 @@ +# A modification version from chainercv repository. +# (See https://github.com/chainer/chainercv/blob/master/chainercv/evaluations/eval_detection_voc.py) +from __future__ import division + +import os +from collections import defaultdict +from datetime import datetime + +import numpy as np +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou + + +def do_voc_evaluation(dataset, predictions, output_folder, logger): + # TODO need to make the use_07_metric format available + # for the user to choose + pred_boxlists = [] + gt_boxlists = [] + for image_id, prediction in enumerate(predictions): + img_info = dataset.get_img_info(image_id) + image_width = img_info["width"] + image_height = img_info["height"] + prediction = prediction.resize((image_width, image_height)) + pred_boxlists.append(prediction) + + gt_boxlist = dataset.get_groundtruth(image_id) + gt_boxlists.append(gt_boxlist) + result = eval_detection_voc( + pred_boxlists=pred_boxlists, + gt_boxlists=gt_boxlists, + iou_thresh=0.5, + use_07_metric=True, + ) + result_str = "mAP: {:.4f}\n".format(result["map"]) + metrics = {'mAP': result["map"]} + for i, ap in enumerate(result["ap"]): + if i == 0: # skip background + continue + metrics[dataset.map_class_id_to_class_name(i)] = ap + result_str += "{:<16}: {:.4f}\n".format( + dataset.map_class_id_to_class_name(i), ap + ) + logger.info(result_str) + if output_folder: + filename = "result_{}.txt".format(datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) + with open(os.path.join(output_folder, filename), "w") as fid: + fid.write(result_str) + + eval_result = dict(metrics=metrics) + return eval_result + + +def eval_detection_voc(pred_boxlists, gt_boxlists, iou_thresh=0.5, use_07_metric=False): + """Evaluate on voc dataset. + Args: + pred_boxlists(list[BoxList]): pred boxlist, has labels and scores fields. + gt_boxlists(list[BoxList]): ground truth boxlist, has labels field. + iou_thresh: iou thresh + use_07_metric: boolean + Returns: + dict represents the results + """ + assert len(gt_boxlists) == len( + pred_boxlists + ), "Length of gt and pred lists need to be same." + prec, rec = calc_detection_voc_prec_rec( + pred_boxlists=pred_boxlists, gt_boxlists=gt_boxlists, iou_thresh=iou_thresh + ) + ap = calc_detection_voc_ap(prec, rec, use_07_metric=use_07_metric) + return {"ap": ap, "map": np.nanmean(ap)} + + +def calc_detection_voc_prec_rec(gt_boxlists, pred_boxlists, iou_thresh=0.5): + """Calculate precision and recall based on evaluation code of PASCAL VOC. + This function calculates precision and recall of + predicted bounding boxes obtained from a dataset which has :math:`N` + images. + The code is based on the evaluation code used in PASCAL VOC Challenge. 
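+ Added description (paraphrasing the implementation below, not part of the
+ original docstring): `gt_boxlists` and `pred_boxlists` are equally long
+ lists of BoxList; ground-truth boxes carry "labels" and "difficult" fields,
+ predictions carry "labels" and "scores". The function returns two lists,
+ `prec` and `rec`, indexed by class label: `rec[l]` stays None when class
+ `l` has no non-difficult ground-truth boxes, and `prec[l]` may contain NaN
+ where fp + tp == 0 for a score cut-off.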
+ """ + n_pos = defaultdict(int) + score = defaultdict(list) + match = defaultdict(list) + for gt_boxlist, pred_boxlist in zip(gt_boxlists, pred_boxlists): + pred_bbox = pred_boxlist.bbox.numpy() + pred_label = pred_boxlist.get_field("labels").numpy() + pred_score = pred_boxlist.get_field("scores").numpy() + gt_bbox = gt_boxlist.bbox.numpy() + gt_label = gt_boxlist.get_field("labels").numpy() + gt_difficult = gt_boxlist.get_field("difficult").numpy() + + for l in np.unique(np.concatenate((pred_label, gt_label)).astype(int)): + pred_mask_l = pred_label == l + pred_bbox_l = pred_bbox[pred_mask_l] + pred_score_l = pred_score[pred_mask_l] + # sort by score + order = pred_score_l.argsort()[::-1] + pred_bbox_l = pred_bbox_l[order] + pred_score_l = pred_score_l[order] + + gt_mask_l = gt_label == l + gt_bbox_l = gt_bbox[gt_mask_l] + gt_difficult_l = gt_difficult[gt_mask_l] + + n_pos[l] += np.logical_not(gt_difficult_l).sum() + score[l].extend(pred_score_l) + + if len(pred_bbox_l) == 0: + continue + if len(gt_bbox_l) == 0: + match[l].extend((0,) * pred_bbox_l.shape[0]) + continue + + # VOC evaluation follows integer typed bounding boxes. + pred_bbox_l = pred_bbox_l.copy() + pred_bbox_l[:, 2:] += 1 + gt_bbox_l = gt_bbox_l.copy() + gt_bbox_l[:, 2:] += 1 + iou = boxlist_iou( + BoxList(pred_bbox_l, gt_boxlist.size), + BoxList(gt_bbox_l, gt_boxlist.size), + ).numpy() + gt_index = iou.argmax(axis=1) + # set -1 if there is no matching ground truth + gt_index[iou.max(axis=1) < iou_thresh] = -1 + del iou + + selec = np.zeros(gt_bbox_l.shape[0], dtype=bool) + for gt_idx in gt_index: + if gt_idx >= 0: + if gt_difficult_l[gt_idx]: + match[l].append(-1) + else: + if not selec[gt_idx]: + match[l].append(1) + else: + match[l].append(0) + selec[gt_idx] = True + else: + match[l].append(0) + + n_fg_class = max(n_pos.keys()) + 1 + prec = [None] * n_fg_class + rec = [None] * n_fg_class + + for l in n_pos.keys(): + score_l = np.array(score[l]) + match_l = np.array(match[l], dtype=np.int8) + + order = score_l.argsort()[::-1] + match_l = match_l[order] + + tp = np.cumsum(match_l == 1) + fp = np.cumsum(match_l == 0) + + # If an element of fp + tp is 0, + # the corresponding element of prec[l] is nan. + prec[l] = tp / (fp + tp) + # If n_pos[l] is 0, rec[l] is None. + if n_pos[l] > 0: + rec[l] = tp / n_pos[l] + + return prec, rec + + +def calc_detection_voc_ap(prec, rec, use_07_metric=False): + """Calculate average precisions based on evaluation code of PASCAL VOC. + This function calculates average precisions + from given precisions and recalls. + The code is based on the evaluation code used in PASCAL VOC Challenge. + Args: + prec (list of numpy.array): A list of arrays. + :obj:`prec[l]` indicates precision for class :math:`l`. + If :obj:`prec[l]` is :obj:`None`, this function returns + :obj:`numpy.nan` for class :math:`l`. + rec (list of numpy.array): A list of arrays. + :obj:`rec[l]` indicates recall for class :math:`l`. + If :obj:`rec[l]` is :obj:`None`, this function returns + :obj:`numpy.nan` for class :math:`l`. + use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric + for calculating average precision. The default value is + :obj:`False`. + Returns: + ~numpy.ndarray: + This function returns an array of average precisions. + The :math:`l`-th value corresponds to the average precision + for class :math:`l`. If :obj:`prec[l]` or :obj:`rec[l]` is + :obj:`None`, the corresponding value is set to :obj:`numpy.nan`. 
+ """ + + n_fg_class = len(prec) + ap = np.empty(n_fg_class) + for l in range(n_fg_class): + if prec[l] is None or rec[l] is None: + ap[l] = np.nan + continue + + if use_07_metric: + # 11 point metric + ap[l] = 0 + for t in np.arange(0.0, 1.1, 0.1): + if np.sum(rec[l] >= t) == 0: + p = 0 + else: + p = np.max(np.nan_to_num(prec[l])[rec[l] >= t]) + ap[l] += p / 11 + else: + # correct AP calculation + # first append sentinel values at the end + mpre = np.concatenate(([0], np.nan_to_num(prec[l]), [0])) + mrec = np.concatenate(([0], rec[l], [1])) + + mpre = np.maximum.accumulate(mpre[::-1])[::-1] + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap[l] = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + + return ap diff --git a/maskrcnn_benchmark/data/datasets/list_dataset.py b/maskrcnn_benchmark/data/datasets/list_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..9058d35b3d4279048732074f4a8dbb6edd4c9ed0 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/list_dataset.py @@ -0,0 +1,36 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +""" +Simple dataset class that wraps a list of path names +""" + +from PIL import Image + +from maskrcnn_benchmark.structures.bounding_box import BoxList + + +class ListDataset(object): + def __init__(self, image_lists, transforms=None): + self.image_lists = image_lists + self.transforms = transforms + + def __getitem__(self, item): + img = Image.open(self.image_lists[item]).convert("RGB") + + # dummy target + w, h = img.size + target = BoxList([[0, 0, w, h]], img.size, mode="xyxy") + + if self.transforms is not None: + img, target = self.transforms(img, target) + + return img, target + + def __len__(self): + return len(self.image_lists) + + def get_img_info(self, item): + """ + Return the image dimensions for the image, without + loading and pre-processing it + """ + pass diff --git a/maskrcnn_benchmark/data/datasets/rpc.py b/maskrcnn_benchmark/data/datasets/rpc.py new file mode 100644 index 0000000000000000000000000000000000000000..115f0d65d02e7acd27c268da19c91cb6ebebcce8 --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/rpc.py @@ -0,0 +1,320 @@ +import glob +import json +import os +import random +from collections import defaultdict + +import numpy as np +import torch +import torch.utils.data +from PIL import Image + +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.structures.segmentation_mask import Heatmap +from maskrcnn_benchmark.utils.density import generate_density_map, rpc_category_to_super_category + +DENSITY_MAP_WIDTH = 100 +DENSITY_MAP_HEIGHT = 100 + + +# -------------------------------------------- +# ----------------Test dataset---------------- +# -------------------------------------------- +class RPCTestDataset(torch.utils.data.Dataset): + def __init__(self, images_dir, ann_file, transforms=None): + self.transforms = transforms + self.images_dir = images_dir + self.ann_file = ann_file + + with open(self.ann_file) as fid: + data = json.load(fid) + + annotations = defaultdict(list) + images = [] + for image in data['images']: + images.append(image) + for ann in data['annotations']: + bbox = ann['bbox'] + x, y, w, h = bbox[0], bbox[1], bbox[2], bbox[3] + annotations[ann['image_id']].append((ann['category_id'], x, y, w, h)) + + self.images = images + self.annotations = dict(annotations) + + def __getitem__(self, index): + image_id = 
self.images[index]['id'] + img_path = os.path.join(self.images_dir, self.images[index]['file_name']) + img = Image.open(img_path).convert("RGB") + width, height = img.size[0], img.size[1] + boxes = [] + labels = [] + ann = self.annotations[image_id] + for category, x, y, w, h in ann: + boxes.append([x, y, x + w, y + h]) + labels.append(category) + + target = BoxList(torch.tensor(boxes, dtype=torch.float32), (width, height), mode="xyxy") + target.add_field('labels', torch.tensor(labels)) + target = target.clip_to_image(remove_empty=True) + + if self.transforms is not None: + img, target = self.transforms(img, target) + + return img, target, index + + def get_annotation(self, image_id): + ann = self.annotations[image_id] + return ann + + def __len__(self): + return len(self.images) + + def get_img_info(self, index): + image = self.images[index] + return {"height": image['height'], "width": image['width'], "id": image['id'], 'file_name': image['file_name']} + + +# -------------------------------------------- +# ----------------Train dataset--------------- +# -------------------------------------------- +class RPCDataset(torch.utils.data.Dataset): + def __init__(self, + images_dir, + ann_file, + use_density_map=False, + rendered=False, + transforms=None): + self.images_dir = images_dir + self.ann_file = ann_file + self.use_density_map = use_density_map + self.rendered = rendered + self.transforms = transforms + self.density_categories = 1 + self.density_map_stride = 1.0 / 8 + self.density_min_sigma = 1.0 + + self.scale = 1.0 + self.ext = '.jpg' + self.image_size = 1815 + + if self.rendered: # Rendered image is 800*800 and format is png + self.scale = 800.0 / 1815.0 + self.ext = '.png' + self.image_size = 800 + + with open(self.ann_file) as fid: + self.annotations = json.load(fid) + + def __getitem__(self, index): + ann = self.annotations[index] + image_id = ann['image_id'] + image_name = os.path.splitext(image_id)[0] + img_path = os.path.join(self.images_dir, image_name + self.ext) + img = Image.open(img_path).convert("RGB") + width, height = img.size[0], img.size[1] + boxes = [] + labels = [] + objects = ann['objects'] + for item in objects: + category = item['category_id'] + x, y, w, h = item['bbox'] + boxes.append([x * self.scale, y * self.scale, (x + w) * self.scale, (y + h) * self.scale]) + labels.append(category) + + target = BoxList(torch.tensor(boxes, dtype=torch.float32), (width, height), mode="xyxy") + target.add_field('labels', torch.tensor(labels)) + + if self.use_density_map: + image_size = self.image_size + size = int(self.density_map_stride * 800) + num_classes = self.density_categories + assert img.width == image_size + assert img.height == image_size + super_categories = [rpc_category_to_super_category(category, num_classes) for category in labels] + density_map = generate_density_map(super_categories, boxes, + scale=size / image_size, + size=size, num_classes=num_classes, + min_sigma=self.density_min_sigma) + target.add_field('heatmap', Heatmap(torch.from_numpy(density_map))) + + target = target.clip_to_image(remove_empty=True) + if self.transforms is not None: + img, target = self.transforms(img, target) + + return img, target, index + + def __len__(self): + return len(self.annotations) + + def get_img_info(self, index): + image_size = 800 if self.rendered else 1815 + return {"height": image_size, "width": image_size} + + +class RPCPseudoDataset(torch.utils.data.Dataset): + + def __init__(self, images_dir, ann_file=None, use_density_map=False, annotations=None, 
transforms=None): + self.images_dir = images_dir + self.ann_file = ann_file + self.use_density_map = use_density_map + self.transforms = transforms + self.density_categories = 1 + self.density_map_stride = 1.0 / 8 + self.density_min_sigma = 1.0 + + if annotations is not None: + self.annotations = annotations + else: + with open(self.ann_file) as fid: + annotations = json.load(fid) + self.annotations = annotations + + print('Valid annotations: {}'.format(len(self.annotations))) + + def __getitem__(self, index): + ann = self.annotations[index] + img_path = os.path.join(self.images_dir, ann['file_name']) + img = Image.open(img_path).convert("RGB") + width, height = img.size[0], img.size[1] + boxes = [] + labels = [] + for category, x, y, w, h in ann['bbox']: + boxes.append([x, y, x + w, y + h]) + labels.append(category) + + target = BoxList(torch.tensor(boxes, dtype=torch.float32), (width, height), mode="xyxy") + target.add_field('labels', torch.tensor(labels)) + target = target.clip_to_image(remove_empty=True) + if self.use_density_map: + size = int(800 * self.density_map_stride) + image_size = img.width # Test images are squares, except 20180824-14-36-38-430.jpg(1860x1859) + num_classes = self.density_categories + super_categories = [rpc_category_to_super_category(category, self.density_categories) for category in labels] + density_map = generate_density_map(super_categories, boxes, + scale=size / image_size, + size=size, + num_classes=num_classes, + min_sigma=self.density_min_sigma) + target.add_field('heatmap', Heatmap(torch.from_numpy(density_map))) + + if self.transforms is not None: + img, target = self.transforms(img, target) + + return img, target, index + + def __len__(self): + return len(self.annotations) + + def get_img_info(self, index): + ann = self.annotations[index] + return {"height": ann['height'], "width": ann['width'], "id": ann['id'], 'file_name': ann['file_name']} + + +class RPCInstanceSelectDataset(torch.utils.data.Dataset): + + def __init__(self, images_dir, ann_file, transforms=None): + self.images_dir = images_dir + self.ann_file = ann_file + self.transforms = transforms + self.images_dir = images_dir + self.threshold = 0.95 + + with open(self.ann_file) as fid: + annotations = json.load(fid) + + delete_keys = [] + total_objects = 0 + filtered_objects = 0 + annotation_dict = defaultdict(list) + for annotation in annotations: + annotation_dict[annotation['image_id']].append(annotation) + for image_id in annotation_dict: + count = 0 + for obj in annotation_dict[image_id]: + total_objects += 1 + if obj['score'] > self.threshold: + filtered_objects += 1 + count += 1 + if count == 0: + delete_keys.append(image_id) + + with open('/data7/lufficc/rpc/instances_test2019.json') as fid: + data = json.load(fid) + + images = [] + for image in data['images']: + if image['id'] not in delete_keys: + images.append(image) + + for image_id in delete_keys: + del annotation_dict[image_id] + + self.annotations = dict(annotation_dict) + self.images = images + assert len(self.images) == len(self.annotations) + + print('Valid annotations: {}'.format(len(self.annotations))) + print('Ratio: {:.3f}({}/{})'.format(filtered_objects / total_objects, filtered_objects, total_objects)) + + def __getitem__(self, index): + ann = self.annotations[self.images[index]['id']] + img_path = os.path.join(self.images_dir, self.images[index]['file_name']) + img = Image.open(img_path).convert("RGB") + width, height = img.size[0], img.size[1] + boxes = [] + labels = [] + viz = False + for obj in ann: + if 
obj['score'] > self.threshold: + category = obj['category_id'] + x, y, w, h = obj['bbox'] + boxes.append([x, y, x + w, y + h]) + labels.append(category) + else: + x, y, w, h = [round(k) for k in obj['bbox']] + img = np.array(img) + img[y:y + h, x:x + w, :] = (164, 166, 164) + img = Image.fromarray(img, mode='RGB') + if viz: + import matplotlib.pyplot as plt + plt.imshow(img) + plt.show() + quit() + + target = BoxList(torch.tensor(boxes, dtype=torch.float32), (width, height), mode="xyxy") + target.add_field('labels', torch.tensor(labels)) + target = target.clip_to_image(remove_empty=True) + + if self.transforms is not None: + img, target = self.transforms(img, target) + + return img, target, index + + def __len__(self): + return len(self.images) + + def get_img_info(self, index): + ann = self.images[index] + return ann + + +class ImagesDataset(torch.utils.data.Dataset): + def __init__(self, transforms=None): + self.folder = '/data7/lufficc/rpc/train2019/' + self.paths = glob.glob(os.path.join(self.folder, '*.jpg')) + random.shuffle(self.paths) + self.transforms = transforms + + def __getitem__(self, index): + path = self.paths[index] + img = Image.open(path).convert('RGB') + width, height = img.size[0], img.size[1] + boxes = np.zeros([0, 4], dtype=np.float32) + target = BoxList(torch.tensor(boxes, dtype=torch.float32), (width, height), mode="xyxy") + if self.transforms: + img, _ = self.transforms(img, target) + + return img, target, index + + def __len__(self): + return len(self.paths) diff --git a/maskrcnn_benchmark/data/datasets/voc.py b/maskrcnn_benchmark/data/datasets/voc.py new file mode 100644 index 0000000000000000000000000000000000000000..459985bd12a47ffe5a246cbf8e00b7930b991a1c --- /dev/null +++ b/maskrcnn_benchmark/data/datasets/voc.py @@ -0,0 +1,134 @@ +import os + +import torch +import torch.utils.data +from PIL import Image +import sys + +if sys.version_info[0] == 2: + import xml.etree.cElementTree as ET +else: + import xml.etree.ElementTree as ET + + +from maskrcnn_benchmark.structures.bounding_box import BoxList + + +class PascalVOCDataset(torch.utils.data.Dataset): + + CLASSES = ( + "__background__ ", + "aeroplane", + "bicycle", + "bird", + "boat", + "bottle", + "bus", + "car", + "cat", + "chair", + "cow", + "diningtable", + "dog", + "horse", + "motorbike", + "person", + "pottedplant", + "sheep", + "sofa", + "train", + "tvmonitor", + ) + + def __init__(self, data_dir, split, use_difficult=False, transforms=None): + self.root = data_dir + self.image_set = split + self.keep_difficult = use_difficult + self.transforms = transforms + + self._annopath = os.path.join(self.root, "Annotations", "%s.xml") + self._imgpath = os.path.join(self.root, "JPEGImages", "%s.jpg") + self._imgsetpath = os.path.join(self.root, "ImageSets", "Main", "%s.txt") + + with open(self._imgsetpath % self.image_set) as f: + self.ids = f.readlines() + self.ids = [x.strip("\n") for x in self.ids] + self.id_to_img_map = {k: v for k, v in enumerate(self.ids)} + + cls = PascalVOCDataset.CLASSES + self.class_to_ind = dict(zip(cls, range(len(cls)))) + + def __getitem__(self, index): + img_id = self.ids[index] + img = Image.open(self._imgpath % img_id).convert("RGB") + + target = self.get_groundtruth(index) + target = target.clip_to_image(remove_empty=True) + + if self.transforms is not None: + img, target = self.transforms(img, target) + + return img, target, index + + def __len__(self): + return len(self.ids) + + def get_groundtruth(self, index): + img_id = self.ids[index] + anno = ET.parse(self._annopath % 
img_id).getroot() + anno = self._preprocess_annotation(anno) + + height, width = anno["im_info"] + target = BoxList(anno["boxes"], (width, height), mode="xyxy") + target.add_field("labels", anno["labels"]) + target.add_field("difficult", anno["difficult"]) + return target + + def _preprocess_annotation(self, target): + boxes = [] + gt_classes = [] + difficult_boxes = [] + TO_REMOVE = 1 + + for obj in target.iter("object"): + difficult = int(obj.find("difficult").text) == 1 + if not self.keep_difficult and difficult: + continue + name = obj.find("name").text.lower().strip() + bb = obj.find("bndbox") + # Make pixel indexes 0-based + # Refer to "https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/datasets/pascal_voc.py#L208-L211" + box = [ + bb.find("xmin").text, + bb.find("ymin").text, + bb.find("xmax").text, + bb.find("ymax").text, + ] + bndbox = tuple( + map(lambda x: x - TO_REMOVE, list(map(int, box))) + ) + + boxes.append(bndbox) + gt_classes.append(self.class_to_ind[name]) + difficult_boxes.append(difficult) + + size = target.find("size") + im_info = tuple(map(int, (size.find("height").text, size.find("width").text))) + + res = { + "boxes": torch.tensor(boxes, dtype=torch.float32), + "labels": torch.tensor(gt_classes), + "difficult": torch.tensor(difficult_boxes), + "im_info": im_info, + } + return res + + def get_img_info(self, index): + img_id = self.ids[index] + anno = ET.parse(self._annopath % img_id).getroot() + size = anno.find("size") + im_info = tuple(map(int, (size.find("height").text, size.find("width").text))) + return {"height": im_info[0], "width": im_info[1]} + + def map_class_id_to_class_name(self, class_id): + return PascalVOCDataset.CLASSES[class_id] diff --git a/maskrcnn_benchmark/data/samplers/__init__.py b/maskrcnn_benchmark/data/samplers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..27982cbe68c6173a911e700273f25973acbf04bd --- /dev/null +++ b/maskrcnn_benchmark/data/samplers/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from .distributed import DistributedSampler +from .grouped_batch_sampler import GroupedBatchSampler +from .iteration_based_batch_sampler import IterationBasedBatchSampler + +__all__ = ["DistributedSampler", "GroupedBatchSampler", "IterationBasedBatchSampler"] diff --git a/maskrcnn_benchmark/data/samplers/distributed.py b/maskrcnn_benchmark/data/samplers/distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..27a280f9ac767e299f996c8c0e1ba4c37a4f2759 --- /dev/null +++ b/maskrcnn_benchmark/data/samplers/distributed.py @@ -0,0 +1,66 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# Code is copy-pasted exactly as in torch.utils.data.distributed. +# FIXME remove this once c10d fixes the bug it has +import math +import torch +import torch.distributed as dist +from torch.utils.data.sampler import Sampler + + +class DistributedSampler(Sampler): + """Sampler that restricts data loading to a subset of the dataset. + It is especially useful in conjunction with + :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each + process can pass a DistributedSampler instance as a DataLoader sampler, + and load a subset of the original dataset that is exclusive to it. + .. note:: + Dataset is assumed to be of constant size. + Arguments: + dataset: Dataset used for sampling. + num_replicas (optional): Number of processes participating in + distributed training. 
+ rank (optional): Rank of the current process within num_replicas. + """ + + def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): + if num_replicas is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + num_replicas = dist.get_world_size() + if rank is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + rank = dist.get_rank() + self.dataset = dataset + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) + self.total_size = self.num_samples * self.num_replicas + self.shuffle = shuffle + + def __iter__(self): + if self.shuffle: + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch) + indices = torch.randperm(len(self.dataset), generator=g).tolist() + else: + indices = torch.arange(len(self.dataset)).tolist() + + # add extra samples to make it evenly divisible + indices += indices[: (self.total_size - len(indices))] + assert len(indices) == self.total_size + + # subsample + offset = self.num_samples * self.rank + indices = indices[offset : offset + self.num_samples] + assert len(indices) == self.num_samples + + return iter(indices) + + def __len__(self): + return self.num_samples + + def set_epoch(self, epoch): + self.epoch = epoch diff --git a/maskrcnn_benchmark/data/samplers/grouped_batch_sampler.py b/maskrcnn_benchmark/data/samplers/grouped_batch_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..d72e2f0265e1016e7bbac67590075fda2bc28a55 --- /dev/null +++ b/maskrcnn_benchmark/data/samplers/grouped_batch_sampler.py @@ -0,0 +1,115 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import itertools + +import torch +from torch.utils.data.sampler import BatchSampler +from torch.utils.data.sampler import Sampler + + +class GroupedBatchSampler(BatchSampler): + """ + Wraps another sampler to yield a mini-batch of indices. + It enforces that elements from the same group should appear in groups of batch_size. + It also tries to provide mini-batches which follows an ordering which is + as close as possible to the ordering from the original sampler. + + Arguments: + sampler (Sampler): Base sampler. + batch_size (int): Size of mini-batch. + drop_uneven (bool): If ``True``, the sampler will drop the batches whose + size is less than ``batch_size`` + + """ + + def __init__(self, sampler, group_ids, batch_size, drop_uneven=False): + if not isinstance(sampler, Sampler): + raise ValueError( + "sampler should be an instance of " + "torch.utils.data.Sampler, but got sampler={}".format(sampler) + ) + self.sampler = sampler + self.group_ids = torch.as_tensor(group_ids) + assert self.group_ids.dim() == 1 + self.batch_size = batch_size + self.drop_uneven = drop_uneven + + self.groups = torch.unique(self.group_ids).sort(0)[0] + + self._can_reuse_batches = False + + def _prepare_batches(self): + dataset_size = len(self.group_ids) + # get the sampled indices from the sampler + sampled_ids = torch.as_tensor(list(self.sampler)) + # potentially not all elements of the dataset were sampled + # by the sampler (e.g., DistributedSampler). + # construct a tensor which contains -1 if the element was + # not sampled, and a non-negative number indicating the + # order where the element was sampled. + # for example. 
if sampled_ids = [3, 1] and dataset_size = 5, + # the order is [-1, 1, -1, 0, -1] + order = torch.full((dataset_size,), -1, dtype=torch.int64) + order[sampled_ids] = torch.arange(len(sampled_ids)) + + # get a mask with the elements that were sampled + mask = order >= 0 + + # find the elements that belong to each individual cluster + clusters = [(self.group_ids == i) & mask for i in self.groups] + # get relative order of the elements inside each cluster + # that follows the order from the sampler + relative_order = [order[cluster] for cluster in clusters] + # with the relative order, find the absolute order in the + # sampled space + permutation_ids = [s[s.sort()[1]] for s in relative_order] + # permute each cluster so that they follow the order from + # the sampler + permuted_clusters = [sampled_ids[idx] for idx in permutation_ids] + + # splits each cluster in batch_size, and merge as a list of tensors + splits = [c.split(self.batch_size) for c in permuted_clusters] + merged = tuple(itertools.chain.from_iterable(splits)) + + # now each batch internally has the right order, but + # they are grouped by clusters. Find the permutation between + # different batches that brings them as close as possible to + # the order that we have in the sampler. For that, we will consider the + # ordering as coming from the first element of each batch, and sort + # correspondingly + first_element_of_batch = [t[0].item() for t in merged] + # get and inverse mapping from sampled indices and the position where + # they occur (as returned by the sampler) + inv_sampled_ids_map = {v: k for k, v in enumerate(sampled_ids.tolist())} + # from the first element in each batch, get a relative ordering + first_index_of_batch = torch.as_tensor( + [inv_sampled_ids_map[s] for s in first_element_of_batch] + ) + + # permute the batches so that they approximately follow the order + # from the sampler + permutation_order = first_index_of_batch.sort(0)[1].tolist() + # finally, permute the batches + batches = [merged[i].tolist() for i in permutation_order] + + if self.drop_uneven: + kept = [] + for batch in batches: + if len(batch) == self.batch_size: + kept.append(batch) + batches = kept + return batches + + def __iter__(self): + if self._can_reuse_batches: + batches = self._batches + self._can_reuse_batches = False + else: + batches = self._prepare_batches() + self._batches = batches + return iter(batches) + + def __len__(self): + if not hasattr(self, "_batches"): + self._batches = self._prepare_batches() + self._can_reuse_batches = True + return len(self._batches) diff --git a/maskrcnn_benchmark/data/samplers/iteration_based_batch_sampler.py b/maskrcnn_benchmark/data/samplers/iteration_based_batch_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..93452b64696dc9b2cd2a347b8051729864bf9510 --- /dev/null +++ b/maskrcnn_benchmark/data/samplers/iteration_based_batch_sampler.py @@ -0,0 +1,31 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
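+# Illustrative sketch of how the GroupedBatchSampler above keeps elements of
+# the same group (e.g. the same aspect-ratio bucket) in the same mini-batch,
+# assuming a plain sequential sampler over six indices:
+#
+#   from torch.utils.data.sampler import SequentialSampler
+#   sampler = SequentialSampler(range(6))
+#   group_ids = [0, 1, 0, 1, 0, 1]
+#   batch_sampler = GroupedBatchSampler(sampler, group_ids, batch_size=2)
+#   list(batch_sampler)  # -> [[0, 2], [1, 3], [4], [5]]; batches never mix groups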
+from torch.utils.data.sampler import BatchSampler + + +class IterationBasedBatchSampler(BatchSampler): + """ + Wraps a BatchSampler, resampling from it until + a specified number of iterations have been sampled + """ + + def __init__(self, batch_sampler, num_iterations, start_iter=0): + self.batch_sampler = batch_sampler + self.num_iterations = num_iterations + self.start_iter = start_iter + + def __iter__(self): + iteration = self.start_iter + while iteration <= self.num_iterations: + # if the underlying sampler has a set_epoch method, like + # DistributedSampler, used for making each process see + # a different split of the dataset, then set it + if hasattr(self.batch_sampler.sampler, "set_epoch"): + self.batch_sampler.sampler.set_epoch(iteration) + for batch in self.batch_sampler: + iteration += 1 + if iteration > self.num_iterations: + break + yield batch + + def __len__(self): + return self.num_iterations diff --git a/maskrcnn_benchmark/data/transforms/__init__.py b/maskrcnn_benchmark/data/transforms/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..000129717bab774e300464745aa5cbf747f82749 --- /dev/null +++ b/maskrcnn_benchmark/data/transforms/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from .transforms import Compose +from .transforms import Resize +from .transforms import RandomHorizontalFlip +from .transforms import ToTensor +from .transforms import Normalize + +from .build import build_transforms diff --git a/maskrcnn_benchmark/data/transforms/build.py b/maskrcnn_benchmark/data/transforms/build.py new file mode 100644 index 0000000000000000000000000000000000000000..8645d4df4d230e05728121577c5091c76872f350 --- /dev/null +++ b/maskrcnn_benchmark/data/transforms/build.py @@ -0,0 +1,28 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from . import transforms as T + + +def build_transforms(cfg, is_train=True): + if is_train: + min_size = cfg.INPUT.MIN_SIZE_TRAIN + max_size = cfg.INPUT.MAX_SIZE_TRAIN + flip_prob = 0.5 # cfg.INPUT.FLIP_PROB_TRAIN + else: + min_size = cfg.INPUT.MIN_SIZE_TEST + max_size = cfg.INPUT.MAX_SIZE_TEST + flip_prob = 0 + + to_bgr255 = cfg.INPUT.TO_BGR255 + normalize_transform = T.Normalize( + mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD, to_bgr255=to_bgr255 + ) + + transform = T.Compose( + [ + T.Resize(min_size, max_size), + T.RandomHorizontalFlip(flip_prob), + T.ToTensor(), + normalize_transform, + ] + ) + return transform diff --git a/maskrcnn_benchmark/data/transforms/transforms.py b/maskrcnn_benchmark/data/transforms/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..7e3ebbd6c0f77ca46e3b410a66c36c6f05e13661 --- /dev/null +++ b/maskrcnn_benchmark/data/transforms/transforms.py @@ -0,0 +1,90 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
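+# Illustrative usage sketch of the pipeline that build_transforms() above
+# assembles for training (a minimal example; `pil_image` and `boxlist` are
+# placeholders). Every transform below takes and returns an (image, target)
+# pair, so the BoxList is resized and flipped together with the image:
+#
+#   transform = Compose([
+#       Resize(cfg.INPUT.MIN_SIZE_TRAIN, cfg.INPUT.MAX_SIZE_TRAIN),
+#       RandomHorizontalFlip(0.5),
+#       ToTensor(),
+#       Normalize(cfg.INPUT.PIXEL_MEAN, cfg.INPUT.PIXEL_STD, cfg.INPUT.TO_BGR255),
+#   ])
+#   image, target = transform(pil_image, boxlist)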
+import random + +import torch +import torchvision +from torchvision.transforms import functional as F + + +class Compose(object): + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, image, target): + for t in self.transforms: + image, target = t(image, target) + return image, target + + def __repr__(self): + format_string = self.__class__.__name__ + "(" + for t in self.transforms: + format_string += "\n" + format_string += " {0}".format(t) + format_string += "\n)" + return format_string + + +class Resize(object): + def __init__(self, min_size, max_size): + if not isinstance(min_size, (list, tuple)): + min_size = (min_size,) + self.min_size = min_size + self.max_size = max_size + + # modified from torchvision to add support for max size + def get_size(self, image_size): + w, h = image_size + size = random.choice(self.min_size) + max_size = self.max_size + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (w <= h and w == size) or (h <= w and h == size): + return (h, w) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + + return (oh, ow) + + def __call__(self, image, target): + size = self.get_size(image.size) + image = F.resize(image, size) + target = target.resize(image.size) + return image, target + + +class RandomHorizontalFlip(object): + def __init__(self, prob=0.5): + self.prob = prob + + def __call__(self, image, target): + if random.random() < self.prob: + image = F.hflip(image) + target = target.transpose(0) + return image, target + + +class ToTensor(object): + def __call__(self, image, target): + return F.to_tensor(image), target + + +class Normalize(object): + def __init__(self, mean, std, to_bgr255=True): + self.mean = mean + self.std = std + self.to_bgr255 = to_bgr255 + + def __call__(self, image, target): + if self.to_bgr255: + image = image[[2, 1, 0]] * 255 + image = F.normalize(image, mean=self.mean, std=self.std) + return image, target diff --git a/maskrcnn_benchmark/engine/__init__.py b/maskrcnn_benchmark/engine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5c7f19c6c00a4ac3f2f2bc66f892e44bcbd72612 --- /dev/null +++ b/maskrcnn_benchmark/engine/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. diff --git a/maskrcnn_benchmark/engine/inference.py b/maskrcnn_benchmark/engine/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..16451de568b568e1d4fd24f4675856181de3662e --- /dev/null +++ b/maskrcnn_benchmark/engine/inference.py @@ -0,0 +1,119 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
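+# Hedged usage sketch of the evaluation entry point defined below: each
+# process runs the model over its shard of the data loader, predictions are
+# gathered on the main process, and evaluate() is called only there (other
+# ranks return None). The dataset name and output folder here are placeholders:
+#
+#   results = inference(
+#       model,
+#       data_loader,                 # a test-time loader (is_train=False)
+#       dataset_name="rpc_2019_test",
+#       iou_types=("bbox",),
+#       output_folder="outputs/inference/rpc_2019_test",
+#   )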
+import logging +import time +import os + +import torch +from tqdm import tqdm + +from maskrcnn_benchmark.data.datasets.evaluation import evaluate +from ..utils.comm import is_main_process, get_world_size +from ..utils.comm import all_gather +from ..utils.comm import synchronize +from ..utils.timer import Timer, get_time_str + + +def compute_on_dataset(model, data_loader, device, timer=None): + model.eval() + results_dict = {} + cpu_device = torch.device("cpu") + for _, batch in enumerate(tqdm(data_loader)): + images, targets, image_ids = batch + images = images.to(device) + with torch.no_grad(): + if timer: + timer.tic() + output = model(images) + if timer: + torch.cuda.synchronize() + timer.toc() + output = [o.to(cpu_device) for o in output] + results_dict.update( + {img_id: result for img_id, result in zip(image_ids, output)} + ) + return results_dict + + +def _accumulate_predictions_from_multiple_gpus(predictions_per_gpu): + all_predictions = all_gather(predictions_per_gpu) + if not is_main_process(): + return + # merge the list of dicts + predictions = {} + for p in all_predictions: + predictions.update(p) + # convert a dict where the key is the index in a list + image_ids = list(sorted(predictions.keys())) + if len(image_ids) != image_ids[-1] + 1: + logger = logging.getLogger("maskrcnn_benchmark.inference") + logger.warning( + "Number of images that were gathered from multiple processes is not " + "a contiguous set. Some images might be missing from the evaluation" + ) + + # convert to a list + predictions = [predictions[i] for i in image_ids] + return predictions + + +def inference( + model, + data_loader, + dataset_name, + iou_types=("bbox",), + box_only=False, + generate_pseudo_labels=False, + device="cuda", + expected_results=(), + expected_results_sigma_tol=4, + output_folder=None, + **kwargs +): + # convert to a torch.device for efficiency + device = torch.device(device) + num_devices = get_world_size() + logger = logging.getLogger("maskrcnn_benchmark.inference") + dataset = data_loader.dataset + logger.info("Start evaluation on {} dataset({} images).".format(dataset_name, len(dataset))) + total_timer = Timer() + inference_timer = Timer() + total_timer.tic() + predictions = compute_on_dataset(model, data_loader, device, inference_timer) + # wait for all processes to complete before measuring the time + synchronize() + total_time = total_timer.toc() + total_time_str = get_time_str(total_time) + logger.info( + "Total run time: {} ({} s / img per device, on {} devices)".format( + total_time_str, total_time * num_devices / len(dataset), num_devices + ) + ) + total_infer_time = get_time_str(inference_timer.total_time) + logger.info( + "Model inference time: {} ({} s / img per device, on {} devices)".format( + total_infer_time, + inference_timer.total_time * num_devices / len(dataset), + num_devices, + ) + ) + + predictions = _accumulate_predictions_from_multiple_gpus(predictions) + if not is_main_process(): + return + + if output_folder: + torch.save(predictions, os.path.join(output_folder, "predictions.pth")) + + extra_args = dict( + box_only=box_only, + iou_types=iou_types, + expected_results=expected_results, + expected_results_sigma_tol=expected_results_sigma_tol, + generate_pseudo_labels=generate_pseudo_labels, + **kwargs, + ) + + return evaluate(dataset=dataset, + predictions=predictions, + output_folder=output_folder, + **extra_args) diff --git a/maskrcnn_benchmark/engine/inference_bak.py b/maskrcnn_benchmark/engine/inference_bak.py new file mode 100644 index 
0000000000000000000000000000000000000000..d08d3e3930299eaa552523b704f913d2e0b287c0 --- /dev/null +++ b/maskrcnn_benchmark/engine/inference_bak.py @@ -0,0 +1,120 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import logging +import time +import os + +import torch +from tqdm import tqdm + +from maskrcnn_benchmark.data.datasets.evaluation import evaluate +from ..utils.comm import is_main_process, get_world_size +from ..utils.comm import all_gather +from ..utils.comm import synchronize +from ..utils.timer import Timer, get_time_str + + +def compute_on_dataset(model, data_loader, device, timer=None): + model.eval() + results_dict = {} + cpu_device = torch.device("cpu") + for _, batch in enumerate(tqdm(data_loader)): + images, targets, image_ids = batch + images = images.to(device) + with torch.no_grad(): + if timer: + timer.tic() + output = model(images) + if timer: + torch.cuda.synchronize() + timer.toc() + output = [o.to(cpu_device) for o in output] + results_dict.update( + {img_id: result for img_id, result in zip(image_ids, output)} + ) + return results_dict + + +def _accumulate_predictions_from_multiple_gpus(predictions_per_gpu): + all_predictions = all_gather(predictions_per_gpu) + if not is_main_process(): + return + # merge the list of dicts + predictions = {} + for p in all_predictions: + predictions.update(p) + # convert a dict where the key is the index in a list + image_ids = list(sorted(predictions.keys())) + if len(image_ids) != image_ids[-1] + 1: + logger = logging.getLogger("maskrcnn_benchmark.inference") + logger.warning( + "Number of images that were gathered from multiple processes is not " + "a contiguous set. Some images might be missing from the evaluation" + ) + + # convert to a list + predictions = [predictions[i] for i in image_ids] + return predictions + + +def inference( + model, + data_loader, + dataset_name, + iou_types=("bbox",), + box_only=False, + generate_pseudo_labels=False, + device="cuda", + expected_results=(), + expected_results_sigma_tol=4, + output_folder=None, + **kwargs +): + # convert to a torch.device for efficiency + device = torch.device(device) + num_devices = get_world_size() + logger = logging.getLogger("maskrcnn_benchmark.inference") + dataset = data_loader.dataset + logger.info("Start evaluation on {} dataset({} images).".format(dataset_name, len(dataset))) + total_timer = Timer() + inference_timer = Timer() + total_timer.tic() + # predictions = compute_on_dataset(model, data_loader, device, inference_timer) + # wait for all processes to complete before measuring the time + synchronize() + total_time = total_timer.toc() + total_time_str = get_time_str(total_time) + logger.info( + "Total run time: {} ({} s / img per device, on {} devices)".format( + total_time_str, total_time * num_devices / len(dataset), num_devices + ) + ) + total_infer_time = get_time_str(inference_timer.total_time) + logger.info( + "Model inference time: {} ({} s / img per device, on {} devices)".format( + total_infer_time, + inference_timer.total_time * num_devices / len(dataset), + num_devices, + ) + ) + + # predictions = _accumulate_predictions_from_multiple_gpus(predictions) + if not is_main_process(): + return + + # if output_folder: + # torch.save(predictions, os.path.join(output_folder, "predictions.pth")) + predictions = torch.load(os.path.join(output_folder, "predictions.pth")) + + extra_args = dict( + box_only=box_only, + iou_types=iou_types, + expected_results=expected_results, + expected_results_sigma_tol=expected_results_sigma_tol, 
+ generate_pseudo_labels=generate_pseudo_labels, + **kwargs, + ) + + return evaluate(dataset=dataset, + predictions=predictions, + output_folder=output_folder, + **extra_args) diff --git a/maskrcnn_benchmark/engine/trainer.py b/maskrcnn_benchmark/engine/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..19b909cfa9a53e1ab3a0316294586192834e1641 --- /dev/null +++ b/maskrcnn_benchmark/engine/trainer.py @@ -0,0 +1,355 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import collections +import datetime +import logging +import os +import time + +import torch +import torch.distributed as dist +from torch.utils.data import Subset +import numpy as np +from maskrcnn_benchmark.data.build import build_dataset, make_data_loader +from maskrcnn_benchmark.data.datasets import RPCPseudoDataset, ConcatDataset, RPCTestDataset +from maskrcnn_benchmark.data.transforms import build_transforms +from maskrcnn_benchmark.utils.comm import get_world_size +from maskrcnn_benchmark.utils.imports import import_file +from maskrcnn_benchmark.utils.metric_logger import MetricLogger +from maskrcnn_benchmark.utils.comm import get_rank +from tools.test_net import do_test + + +def reduce_loss_dict(loss_dict): + """ + Reduce the loss dictionary from all processes so that process with rank + 0 has the averaged results. Returns a dict with the same fields as + loss_dict, after reduction. + """ + world_size = get_world_size() + if world_size < 2: + return loss_dict + with torch.no_grad(): + loss_names = [] + all_losses = [] + for k in sorted(loss_dict.keys()): + loss_names.append(k) + all_losses.append(loss_dict[k]) + all_losses = torch.stack(all_losses, dim=0) + dist.reduce(all_losses, dst=0) + if dist.get_rank() == 0: + # only main process gets accumulated, so only divide by + # world_size in this case + all_losses /= world_size + reduced_losses = {k: v for k, v in zip(loss_names, all_losses)} + return reduced_losses + + +def write_metric(eval_result, prefix, summary_writer, global_step): + for key in eval_result: + value = eval_result[key] + tag = '{}/{}'.format(prefix, key) + if isinstance(value, collections.Mapping): + write_metric(value, tag, summary_writer, global_step) + else: + summary_writer.add_scalar(tag, value, global_step=global_step) + + +def do_train( + cfg, + model, + data_loader, + optimizer, + scheduler, + checkpointer, + device, + checkpoint_period, + arguments, + distributed +): + logger = logging.getLogger("maskrcnn_benchmark.trainer") + logger.info("Start training") + meters = MetricLogger(delimiter=" ") + max_iter = len(data_loader) + start_iter = arguments["iteration"] + model.train() + start_training_time = time.time() + end = time.time() + + summary_writer = None + if get_rank() == 0: + import tensorboardX + summary_writer = tensorboardX.SummaryWriter(os.path.join(checkpointer.save_dir, 'tf_logs')) + + for iteration, (images, targets, _) in enumerate(data_loader, start_iter): + data_time = time.time() - end + iteration = iteration + 1 + arguments["iteration"] = iteration + + scheduler.step() + + images = images.to(device) + targets = [target.to(device) for target in targets] + + loss_dict = model(images, targets) + + losses = sum(loss for loss in loss_dict.values()) + + # reduce losses over all GPUs for logging purposes + loss_dict_reduced = reduce_loss_dict(loss_dict) + losses_reduced = sum(loss for loss in loss_dict_reduced.values()) + meters.update(loss=losses_reduced, **loss_dict_reduced) + + optimizer.zero_grad() + losses.backward() + 
optimizer.step() + + batch_time = time.time() - end + end = time.time() + meters.update(time=batch_time, data=data_time) + + eta_seconds = meters.time.global_avg * (max_iter - iteration) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + + if iteration % 20 == 0 or iteration == max_iter: + if summary_writer: + summary_writer.add_scalar('loss/total_loss', losses_reduced, global_step=iteration) + for name, value in loss_dict_reduced.items(): + summary_writer.add_scalar('loss/%s' % name, value, global_step=iteration) + summary_writer.add_scalar('lr', optimizer.param_groups[0]["lr"], global_step=iteration) + + logger.info( + meters.delimiter.join( + [ + "eta: {eta}", + "iter: {iter}", + "{meters}", + "lr: {lr:.6f}", + "max mem: {memory:.0f}", + ] + ).format( + eta=eta_string, + iter=iteration, + meters=str(meters), + lr=optimizer.param_groups[0]["lr"], + memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, + ) + ) + if iteration % checkpoint_period == 0: + checkpointer.save("model_{:07d}".format(iteration), **arguments) + if iteration != max_iter: + eval_results = do_test(cfg, model, distributed, iteration=iteration) + if get_rank() == 0 and summary_writer: # only on main thread results are returned. + for eval_result, dataset in zip(eval_results, cfg.DATASETS.TEST): + write_metric(eval_result['metrics'], 'metrics/' + dataset, summary_writer, iteration) + model.train() # *IMPORTANT*: restore train state + if iteration == max_iter: + checkpointer.save("model_final", **arguments) + + total_training_time = time.time() - start_training_time + total_time_str = str(datetime.timedelta(seconds=total_training_time)) + logger.info( + "Total training time: {} ({:.4f} s / it)".format( + total_time_str, total_training_time / (max_iter) + ) + ) + + +def cross_do_train( + cfg, + model, + optimizer, + scheduler, + checkpointer, + device, + checkpoint_period, + arguments, + distributed +): + logger = logging.getLogger("maskrcnn_benchmark.trainer") + logger.info("Start cross training!") + meters = MetricLogger(delimiter=" ") + max_iter = cfg.SOLVER.MAX_ITER + start_iter = arguments["iteration"] + model.train() + # ----------------prepare---------------- + # --------------------------------------- + # --------------------------------------- + is_train = True + paths_catalog = import_file( + "maskrcnn_benchmark.config.paths_catalog", cfg.PATHS_CATALOG, True + ) + DatasetCatalog = paths_catalog.DatasetCatalog + transforms = build_transforms(cfg, is_train=is_train) + dataset_list = cfg.DATASETS.TRAIN + + start_training_time = time.time() + end = time.time() + + summary_writer = None + if get_rank() == 0: + import tensorboardX + summary_writer = tensorboardX.SummaryWriter(os.path.join(checkpointer.save_dir, 'tf_logs')) + + ann_file = cfg.TEST.PSEUDO_LABELS_ANN_FILE + images_dir = cfg.TEST.TEST_IMAGES_DIR + + iteration = start_iter + total_steps = cfg.SOLVER.CROSS_TRAIN_STEPS + for step in range(total_steps): + logger.info('Start training {}th/{} step'.format(step + 1, total_steps)) + iter_per_step = cfg.SOLVER.ITER_PER_STEP + + pseudo_dataset = RPCPseudoDataset(images_dir=images_dir, ann_file=ann_file, use_density_map=True, transforms=transforms) + # --------------------------------------------------------------------------------- + pseudo_dataset.density_categories = cfg.MODEL.DENSITY_HEAD.NUM_CLASSES + pseudo_dataset.density_map_stride = cfg.MODEL.DENSITY_HEAD.FPN_LEVEL_STRIDE + min_sigmas = { + 1: 1.0, + 2: 0.5, + 3: 0.333, + } + min_sigma = min_sigmas[cfg.MODEL.DENSITY_HEAD.FPN_LEVEL] + 
pseudo_dataset.density_min_sigma = min_sigma + print('using density_min_sigma: {}'.format(min_sigma)) + # --------------------------------------------------------------------------------- + + train_datasets = build_dataset(cfg, dataset_list, transforms, DatasetCatalog, is_train) + + ratio = cfg.SOLVER.CROSS_TRAIN_DATA_RATIO + if ratio > 0: # dynamic source dataset according to pseudo dataset + assert len(train_datasets) == 1 + train_size = len(train_datasets[0]) + indices = np.arange(train_size) + train_size = min(train_size, int(ratio * len(pseudo_dataset))) + indices = np.random.choice(indices, size=train_size, replace=False) + subset_dataset = Subset(train_datasets[0], indices=indices) + train_datasets = [subset_dataset] + elif ratio < 0: # fixed size source dataset + assert len(train_datasets) == 1 + train_size = len(train_datasets[0]) + indices = np.arange(train_size) + train_size = min(train_size, abs(ratio)) + indices = np.random.choice(indices, size=train_size, replace=False) + subset_dataset = Subset(train_datasets[0], indices=indices) + train_datasets = [subset_dataset] + + datasets_s = train_datasets + [pseudo_dataset] + datasets_s = ConcatDataset(datasets_s) + + # logger.info('Subset train dataset: {}'.format(len(subset_dataset))) + logger.info('Pseudo train dataset: {}'.format(len(pseudo_dataset))) + logger.info('Source train dataset: {}'.format(len(datasets_s))) + + # data_loader_s = make_data_loader( + # cfg, + # is_train=is_train, + # is_distributed=distributed, + # start_iter=0, + # datasets=[datasets_s], + # num_iters=iter_per_step + # ) + + data_loader_t = make_data_loader( + cfg, + is_train=is_train, + is_distributed=distributed, + start_iter=0, + datasets=[datasets_s], + num_iters=iter_per_step + ) + + # for (images_s, targets_s, _), (images_t, targets_t, _) in zip(data_loader_s, data_loader_t): + for (images_t, targets_t, _) in data_loader_t: + data_time = time.time() - end + iteration = iteration + 1 + arguments["iteration"] = iteration + + scheduler.step() + + # images_s = images_s.to(device) + # targets_s = [target.to(device) for target in targets_s] + # loss_dict_s = model(images_s, targets_s, is_target_domain=False) + # loss_dict_s = {key + '_s': value for key, value in loss_dict_s.items()} + + images_t = images_t.to(device) + targets_t = [target.to(device) for target in targets_t] + loss_dict = model(images_t, targets_t, is_target_domain=True) + + # loss_dict.update(loss_dict_s) + + losses = sum(loss for loss in loss_dict.values()) + + # reduce losses over all GPUs for logging purposes + loss_dict_reduced = reduce_loss_dict(loss_dict) + losses_reduced = sum(loss for loss in loss_dict_reduced.values()) + meters.update(loss=losses_reduced, **loss_dict_reduced) + + optimizer.zero_grad() + losses.backward() + optimizer.step() + + batch_time = time.time() - end + end = time.time() + meters.update(time=batch_time, data=data_time) + + eta_seconds = meters.time.global_avg * (max_iter - iteration) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + + if iteration % 20 == 0 or iteration == max_iter: + if summary_writer: + summary_writer.add_scalar('loss/total_loss', losses_reduced, global_step=iteration) + for name, value in loss_dict_reduced.items(): + summary_writer.add_scalar('loss/%s' % name, value, global_step=iteration) + summary_writer.add_scalar('lr', optimizer.param_groups[0]["lr"], global_step=iteration) + + logger.info( + meters.delimiter.join( + [ + "eta: {eta}", + "iter: {iter}", + "{meters}", + "lr: {lr:.6f}", + "max mem: {memory:.0f}", + ] + 
).format( + eta=eta_string, + iter=iteration, + meters=str(meters), + lr=optimizer.param_groups[0]["lr"], + memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, + ) + ) + if iteration % checkpoint_period == 0: + checkpointer.save("model_{:07d}".format(iteration), **arguments) + if iteration != max_iter: + eval_results = do_test(cfg, model, distributed, iteration=iteration) + if get_rank() == 0 and summary_writer: # only on main thread results are returned. + for eval_result, dataset in zip(eval_results, cfg.DATASETS.TEST): + write_metric(eval_result['metrics'], 'metrics/' + dataset, summary_writer, iteration) + model.train() # restore train state + + logger.info('Generating new pseudo labels...') + test_dataset = RPCTestDataset(images_dir=cfg.TEST.TEST_IMAGES_DIR, + ann_file=cfg.TEST.TEST_ANN_FILE, + transforms=build_transforms(cfg, is_train=False)) + dataset_name = 'rpc_2019_test' + dataset_names = [dataset_name] + # thresholds = [0.95, 0.97, 0.98, 0.99] + # threshold = thresholds[bisect_right([5, 10, 12], step)] + threshold = 0.95 + eval_results = do_test(cfg, model, distributed, iteration=iteration, generate_pseudo_labels=True, dataset_names=dataset_names, + datasets=[test_dataset], threshold=threshold, use_ground_truth=cfg.TEST.USE_GROUND_TRUTH) + if get_rank() == 0 and summary_writer: # only on main thread results are returned. + for eval_result, dataset in zip(eval_results, dataset_names): + write_metric(eval_result['metrics'], 'metrics/' + dataset, summary_writer, iteration) + model.train() # restore train state + ann_file = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name, 'pseudo_labeling.json') + + checkpointer.save("model_final", **arguments) + total_training_time = time.time() - start_training_time + total_time_str = str(datetime.timedelta(seconds=total_training_time)) + logger.info( + "Total training time: {} ({:.4f} s / it)".format( + total_time_str, total_training_time / (max_iter) + ) + ) diff --git a/maskrcnn_benchmark/layers/__init__.py b/maskrcnn_benchmark/layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bab50abae06fc0a39a17d20a58b10ce49ec409bc --- /dev/null +++ b/maskrcnn_benchmark/layers/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch + +from .batch_norm import FrozenBatchNorm2d +from .misc import Conv2d +from .misc import ConvTranspose2d +from .misc import BatchNorm2d +from .misc import interpolate +from .nms import nms +from .roi_align import ROIAlign +from .roi_align import roi_align +from .roi_pool import ROIPool +from .roi_pool import roi_pool +from .smooth_l1_loss import smooth_l1_loss +from .sigmoid_focal_loss import SigmoidFocalLoss + +__all__ = ["nms", "roi_align", "ROIAlign", "roi_pool", "ROIPool", + "smooth_l1_loss", "Conv2d", "ConvTranspose2d", "interpolate", + "BatchNorm2d", "FrozenBatchNorm2d", "SigmoidFocalLoss" + ] + diff --git a/maskrcnn_benchmark/layers/_utils.py b/maskrcnn_benchmark/layers/_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3dabc127b221d67eae7587ab4905416fa5fcf121 --- /dev/null +++ b/maskrcnn_benchmark/layers/_utils.py @@ -0,0 +1,39 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+import glob +import os.path + +import torch + +try: + from torch.utils.cpp_extension import load as load_ext + from torch.utils.cpp_extension import CUDA_HOME +except ImportError: + raise ImportError("The cpp layer extensions requires PyTorch 0.4 or higher") + + +def _load_C_extensions(): + this_dir = os.path.dirname(os.path.abspath(__file__)) + this_dir = os.path.dirname(this_dir) + this_dir = os.path.join(this_dir, "csrc") + + main_file = glob.glob(os.path.join(this_dir, "*.cpp")) + source_cpu = glob.glob(os.path.join(this_dir, "cpu", "*.cpp")) + source_cuda = glob.glob(os.path.join(this_dir, "cuda", "*.cu")) + + source = main_file + source_cpu + + extra_cflags = [] + if torch.cuda.is_available() and CUDA_HOME is not None: + source.extend(source_cuda) + extra_cflags = ["-DWITH_CUDA"] + source = [os.path.join(this_dir, s) for s in source] + extra_include_paths = [this_dir] + return load_ext( + "torchvision", + source, + extra_cflags=extra_cflags, + extra_include_paths=extra_include_paths, + ) + + +_C = _load_C_extensions() diff --git a/maskrcnn_benchmark/layers/batch_norm.py b/maskrcnn_benchmark/layers/batch_norm.py new file mode 100644 index 0000000000000000000000000000000000000000..903607ac3895947d1aa6d6c4766624af0e97bc71 --- /dev/null +++ b/maskrcnn_benchmark/layers/batch_norm.py @@ -0,0 +1,24 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +from torch import nn + + +class FrozenBatchNorm2d(nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters + are fixed + """ + + def __init__(self, n): + super(FrozenBatchNorm2d, self).__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def forward(self, x): + scale = self.weight * self.running_var.rsqrt() + bias = self.bias - self.running_mean * scale + scale = scale.reshape(1, -1, 1, 1) + bias = bias.reshape(1, -1, 1, 1) + return x * scale + bias diff --git a/maskrcnn_benchmark/layers/misc.py b/maskrcnn_benchmark/layers/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..a8cf1c680c06b57412bfdf7a1c4a9c53f4acdbbd --- /dev/null +++ b/maskrcnn_benchmark/layers/misc.py @@ -0,0 +1,110 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +""" +helper class that supports empty tensors on some nn functions. + +Ideally, add support directly in PyTorch to empty tensors in +those functions. 
+ +This can be removed once https://github.com/pytorch/pytorch/issues/12013 +is implemented +""" + +import math +import torch +from torch.nn.modules.utils import _ntuple + + +class _NewEmptyTensorOp(torch.autograd.Function): + @staticmethod + def forward(ctx, x, new_shape): + ctx.shape = x.shape + return x.new_empty(new_shape) + + @staticmethod + def backward(ctx, grad): + shape = ctx.shape + return _NewEmptyTensorOp.apply(grad, shape), None + + +class Conv2d(torch.nn.Conv2d): + def forward(self, x): + if x.numel() > 0: + return super(Conv2d, self).forward(x) + # get output shape + + output_shape = [ + (i + 2 * p - (di * (k - 1) + 1)) // d + 1 + for i, p, di, k, d in zip( + x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride + ) + ] + output_shape = [x.shape[0], self.weight.shape[0]] + output_shape + return _NewEmptyTensorOp.apply(x, output_shape) + + +class ConvTranspose2d(torch.nn.ConvTranspose2d): + def forward(self, x): + if x.numel() > 0: + return super(ConvTranspose2d, self).forward(x) + # get output shape + + output_shape = [ + (i - 1) * d - 2 * p + (di * (k - 1) + 1) + op + for i, p, di, k, d, op in zip( + x.shape[-2:], + self.padding, + self.dilation, + self.kernel_size, + self.stride, + self.output_padding, + ) + ] + output_shape = [x.shape[0], self.bias.shape[0]] + output_shape + return _NewEmptyTensorOp.apply(x, output_shape) + + +class BatchNorm2d(torch.nn.BatchNorm2d): + def forward(self, x): + if x.numel() > 0: + return super(BatchNorm2d, self).forward(x) + # get output shape + output_shape = x.shape + return _NewEmptyTensorOp.apply(x, output_shape) + + +def interpolate( + input, size=None, scale_factor=None, mode="nearest", align_corners=None +): + if input.numel() > 0: + return torch.nn.functional.interpolate( + input, size, scale_factor, mode, align_corners + ) + + def _check_size_scale_factor(dim): + if size is None and scale_factor is None: + raise ValueError("either size or scale_factor should be defined") + if size is not None and scale_factor is not None: + raise ValueError("only one of size or scale_factor should be defined") + if ( + scale_factor is not None + and isinstance(scale_factor, tuple) + and len(scale_factor) != dim + ): + raise ValueError( + "scale_factor shape must match input shape. " + "Input is {}D, scale_factor size is {}".format(dim, len(scale_factor)) + ) + + def _output_size(dim): + _check_size_scale_factor(dim) + if size is not None: + return size + scale_factors = _ntuple(dim)(scale_factor) + # math.floor might return float in py2.7 + return [ + int(math.floor(input.size(i + 2) * scale_factors[i])) for i in range(dim) + ] + + output_shape = tuple(_output_size(2)) + output_shape = input.shape[:-2] + output_shape + return _NewEmptyTensorOp.apply(input, output_shape) diff --git a/maskrcnn_benchmark/layers/nms.py b/maskrcnn_benchmark/layers/nms.py new file mode 100644 index 0000000000000000000000000000000000000000..1e80b555045d85e509c917f940ee9bc62738fee7 --- /dev/null +++ b/maskrcnn_benchmark/layers/nms.py @@ -0,0 +1,7 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
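+# Illustrative note on the empty-tensor helpers in misc.py above: when the
+# input has zero elements, Conv2d / ConvTranspose2d / interpolate skip the
+# actual operation and only propagate the computed output shape, e.g.
+#
+#   conv = Conv2d(256, 256, kernel_size=3, padding=1)
+#   out = conv(torch.empty(0, 256, 14, 14))   # empty tensor of shape (0, 256, 14, 14)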
+# from ._utils import _C +from maskrcnn_benchmark import _C + +nms = _C.nms +# nms.__doc__ = """ +# This function performs Non-maximum suppresion""" diff --git a/maskrcnn_benchmark/layers/roi_align.py b/maskrcnn_benchmark/layers/roi_align.py new file mode 100644 index 0000000000000000000000000000000000000000..170c8f18696aed19c4b9533a51933264530a1530 --- /dev/null +++ b/maskrcnn_benchmark/layers/roi_align.py @@ -0,0 +1,68 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +from torch import nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair + +from maskrcnn_benchmark import _C + + +class _ROIAlign(Function): + @staticmethod + def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio): + ctx.save_for_backward(roi) + ctx.output_size = _pair(output_size) + ctx.spatial_scale = spatial_scale + ctx.sampling_ratio = sampling_ratio + ctx.input_shape = input.size() + output = _C.roi_align_forward( + input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio + ) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + rois, = ctx.saved_tensors + output_size = ctx.output_size + spatial_scale = ctx.spatial_scale + sampling_ratio = ctx.sampling_ratio + bs, ch, h, w = ctx.input_shape + grad_input = _C.roi_align_backward( + grad_output, + rois, + spatial_scale, + output_size[0], + output_size[1], + bs, + ch, + h, + w, + sampling_ratio, + ) + return grad_input, None, None, None, None + + +roi_align = _ROIAlign.apply + + +class ROIAlign(nn.Module): + def __init__(self, output_size, spatial_scale, sampling_ratio): + super(ROIAlign, self).__init__() + self.output_size = output_size + self.spatial_scale = spatial_scale + self.sampling_ratio = sampling_ratio + + def forward(self, input, rois): + return roi_align( + input, rois, self.output_size, self.spatial_scale, self.sampling_ratio + ) + + def __repr__(self): + tmpstr = self.__class__.__name__ + "(" + tmpstr += "output_size=" + str(self.output_size) + tmpstr += ", spatial_scale=" + str(self.spatial_scale) + tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) + tmpstr += ")" + return tmpstr diff --git a/maskrcnn_benchmark/layers/roi_pool.py b/maskrcnn_benchmark/layers/roi_pool.py new file mode 100644 index 0000000000000000000000000000000000000000..c0e42756ee6fcd779387255391a30079a28f5e60 --- /dev/null +++ b/maskrcnn_benchmark/layers/roi_pool.py @@ -0,0 +1,63 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
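+# Illustrative sketch of the ROIAlign module defined above, assuming a
+# feature map pooled at stride 16 and the usual (batch_index, x1, y1, x2, y2)
+# layout for `rois` (both are assumptions of this example, not shown above):
+#
+#   pooler = ROIAlign(output_size=(7, 7), spatial_scale=1.0 / 16, sampling_ratio=2)
+#   rois = torch.tensor([[0.0, 10.0, 10.0, 120.0, 200.0]])  # one box in image 0
+#   crops = pooler(features, rois)   # -> tensor of shape (1, C, 7, 7)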
+import torch +from torch import nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair + +from maskrcnn_benchmark import _C + + +class _ROIPool(Function): + @staticmethod + def forward(ctx, input, roi, output_size, spatial_scale): + ctx.output_size = _pair(output_size) + ctx.spatial_scale = spatial_scale + ctx.input_shape = input.size() + output, argmax = _C.roi_pool_forward( + input, roi, spatial_scale, output_size[0], output_size[1] + ) + ctx.save_for_backward(input, roi, argmax) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + input, rois, argmax = ctx.saved_tensors + output_size = ctx.output_size + spatial_scale = ctx.spatial_scale + bs, ch, h, w = ctx.input_shape + grad_input = _C.roi_pool_backward( + grad_output, + input, + rois, + argmax, + spatial_scale, + output_size[0], + output_size[1], + bs, + ch, + h, + w, + ) + return grad_input, None, None, None + + +roi_pool = _ROIPool.apply + + +class ROIPool(nn.Module): + def __init__(self, output_size, spatial_scale): + super(ROIPool, self).__init__() + self.output_size = output_size + self.spatial_scale = spatial_scale + + def forward(self, input, rois): + return roi_pool(input, rois, self.output_size, self.spatial_scale) + + def __repr__(self): + tmpstr = self.__class__.__name__ + "(" + tmpstr += "output_size=" + str(self.output_size) + tmpstr += ", spatial_scale=" + str(self.spatial_scale) + tmpstr += ")" + return tmpstr diff --git a/maskrcnn_benchmark/layers/sigmoid_focal_loss.py b/maskrcnn_benchmark/layers/sigmoid_focal_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..c42b4d69900e6222d972ee1296648eae97fec511 --- /dev/null +++ b/maskrcnn_benchmark/layers/sigmoid_focal_loss.py @@ -0,0 +1,76 @@ +import torch +from torch import nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +from maskrcnn_benchmark import _C + +# TODO: Use JIT to replace CUDA implementation in the future. 
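+# Both the CUDA path and the CPU fallback below implement the focal loss of
+# Lin et al. ("Focal Loss for Dense Object Detection"):
+#
+#     FL(p_t) = -alpha_t * (1 - p_t)^gamma * log(p_t)
+#
+# where p_t is the predicted probability of the ground-truth class, alpha is
+# applied to positive (matched) entries and (1 - alpha) to negatives, as
+# written out explicitly in sigmoid_focal_loss_cpu below.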
+class _SigmoidFocalLoss(Function): + @staticmethod + def forward(ctx, logits, targets, gamma, alpha): + ctx.save_for_backward(logits, targets) + num_classes = logits.shape[1] + ctx.num_classes = num_classes + ctx.gamma = gamma + ctx.alpha = alpha + + losses = _C.sigmoid_focalloss_forward( + logits, targets, num_classes, gamma, alpha + ) + return losses + + @staticmethod + @once_differentiable + def backward(ctx, d_loss): + logits, targets = ctx.saved_tensors + num_classes = ctx.num_classes + gamma = ctx.gamma + alpha = ctx.alpha + d_loss = d_loss.contiguous() + d_logits = _C.sigmoid_focalloss_backward( + logits, targets, d_loss, num_classes, gamma, alpha + ) + return d_logits, None, None, None, None + + +sigmoid_focal_loss_cuda = _SigmoidFocalLoss.apply + + +def sigmoid_focal_loss_cpu(logits, targets, gamma, alpha): + num_classes = logits.shape[1] + gamma = gamma[0] + alpha = alpha[0] + dtype = targets.dtype + device = targets.device + class_range = torch.arange(1, num_classes+1, dtype=dtype, device=device).unsqueeze(0) + + t = targets.unsqueeze(1) + p = torch.sigmoid(logits) + term1 = (1 - p) ** gamma * torch.log(p) + term2 = p ** gamma * torch.log(1 - p) + return -(t == class_range).float() * term1 * alpha - ((t != class_range) * (t >= 0)).float() * term2 * (1 - alpha) + + +class SigmoidFocalLoss(nn.Module): + def __init__(self, gamma, alpha): + super(SigmoidFocalLoss, self).__init__() + self.gamma = gamma + self.alpha = alpha + + def forward(self, logits, targets): + device = logits.device + if logits.is_cuda: + loss_func = sigmoid_focal_loss_cuda + else: + loss_func = sigmoid_focal_loss_cpu + + loss = loss_func(logits, targets, self.gamma, self.alpha) + return loss.sum() + + def __repr__(self): + tmpstr = self.__class__.__name__ + "(" + tmpstr += "gamma=" + str(self.gamma) + tmpstr += ", alpha=" + str(self.alpha) + tmpstr += ")" + return tmpstr diff --git a/maskrcnn_benchmark/layers/smooth_l1_loss.py b/maskrcnn_benchmark/layers/smooth_l1_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..9c4664bb47b731eb087aa777d6f9a4b28fddd03a --- /dev/null +++ b/maskrcnn_benchmark/layers/smooth_l1_loss.py @@ -0,0 +1,16 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch + + +# TODO maybe push this to nn? +def smooth_l1_loss(input, target, beta=1. / 9, size_average=True): + """ + very similar to the smooth_l1_loss from pytorch, but with + the extra beta parameter + """ + n = torch.abs(input - target) + cond = n < beta + loss = torch.where(cond, 0.5 * n ** 2 / beta, n - 0.5 * beta) + if size_average: + return loss.mean() + return loss.sum() diff --git a/maskrcnn_benchmark/modeling/__init__.py b/maskrcnn_benchmark/modeling/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/maskrcnn_benchmark/modeling/backbone/__init__.py b/maskrcnn_benchmark/modeling/backbone/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..537ebe56e683f4c665bb9b60fed9a1811645d8e5 --- /dev/null +++ b/maskrcnn_benchmark/modeling/backbone/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from .backbone import build_backbone +from . 
import fbnet diff --git a/maskrcnn_benchmark/modeling/backbone/backbone.py b/maskrcnn_benchmark/modeling/backbone/backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..6033d6fe0e2561d09c129952f6e325d0f3fda782 --- /dev/null +++ b/maskrcnn_benchmark/modeling/backbone/backbone.py @@ -0,0 +1,79 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from collections import OrderedDict + +from torch import nn + +from maskrcnn_benchmark.modeling import registry +from maskrcnn_benchmark.modeling.make_layers import conv_with_kaiming_uniform +from . import fpn as fpn_module +from . import resnet + + +@registry.BACKBONES.register("R-50-C4") +@registry.BACKBONES.register("R-50-C5") +@registry.BACKBONES.register("R-101-C4") +@registry.BACKBONES.register("R-101-C5") +def build_resnet_backbone(cfg): + body = resnet.ResNet(cfg) + model = nn.Sequential(OrderedDict([("body", body)])) + model.out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS + return model + + +@registry.BACKBONES.register("R-50-FPN") +@registry.BACKBONES.register("R-101-FPN") +@registry.BACKBONES.register("R-152-FPN") +def build_resnet_fpn_backbone(cfg): + body = resnet.ResNet(cfg) + in_channels_stage2 = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS + out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS + fpn = fpn_module.FPN( + in_channels_list=[ + in_channels_stage2, + in_channels_stage2 * 2, + in_channels_stage2 * 4, + in_channels_stage2 * 8, + ], + out_channels=out_channels, + conv_block=conv_with_kaiming_uniform( + cfg.MODEL.FPN.USE_GN, cfg.MODEL.FPN.USE_RELU + ), + top_blocks=fpn_module.LastLevelMaxPool(), + ) + model = nn.Sequential(OrderedDict([("body", body), ("fpn", fpn)])) + model.out_channels = out_channels + return model + + +@registry.BACKBONES.register("R-50-FPN-RETINANET") +@registry.BACKBONES.register("R-101-FPN-RETINANET") +def build_resnet_fpn_p3p7_backbone(cfg): + body = resnet.ResNet(cfg) + in_channels_stage2 = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS + out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS + in_channels_p6p7 = in_channels_stage2 * 8 if cfg.MODEL.RETINANET.USE_C5 \ + else out_channels + fpn = fpn_module.FPN( + in_channels_list=[ + 0, + in_channels_stage2 * 2, + in_channels_stage2 * 4, + in_channels_stage2 * 8, + ], + out_channels=out_channels, + conv_block=conv_with_kaiming_uniform( + cfg.MODEL.FPN.USE_GN, cfg.MODEL.FPN.USE_RELU + ), + top_blocks=fpn_module.LastLevelP6P7(in_channels_p6p7, out_channels), + ) + model = nn.Sequential(OrderedDict([("body", body), ("fpn", fpn)])) + model.out_channels = out_channels + return model + + +def build_backbone(cfg): + assert cfg.MODEL.BACKBONE.CONV_BODY in registry.BACKBONES, \ + "cfg.MODEL.BACKBONE.CONV_BODY: {} are not registered in registry".format( + cfg.MODEL.BACKBONE.CONV_BODY + ) + return registry.BACKBONES[cfg.MODEL.BACKBONE.CONV_BODY](cfg) diff --git a/maskrcnn_benchmark/modeling/backbone/fbnet.py b/maskrcnn_benchmark/modeling/backbone/fbnet.py new file mode 100644 index 0000000000000000000000000000000000000000..0d8cf1522f61dd77c4c8617a1555a004509e4352 --- /dev/null +++ b/maskrcnn_benchmark/modeling/backbone/fbnet.py @@ -0,0 +1,252 @@ +from __future__ import absolute_import, division, print_function, unicode_literals + +import copy +import json +import logging +from collections import OrderedDict + +from . 
import ( + fbnet_builder as mbuilder, + fbnet_modeldef as modeldef, +) +import torch.nn as nn +from maskrcnn_benchmark.modeling import registry +from maskrcnn_benchmark.modeling.rpn import rpn +from maskrcnn_benchmark.modeling import poolers + + +logger = logging.getLogger(__name__) + + +def create_builder(cfg): + bn_type = cfg.MODEL.FBNET.BN_TYPE + if bn_type == "gn": + bn_type = (bn_type, cfg.GROUP_NORM.NUM_GROUPS) + factor = cfg.MODEL.FBNET.SCALE_FACTOR + + arch = cfg.MODEL.FBNET.ARCH + arch_def = cfg.MODEL.FBNET.ARCH_DEF + if len(arch_def) > 0: + arch_def = json.loads(arch_def) + if arch in modeldef.MODEL_ARCH: + if len(arch_def) > 0: + assert ( + arch_def == modeldef.MODEL_ARCH[arch] + ), "Two architectures with the same name {},\n{},\n{}".format( + arch, arch_def, modeldef.MODEL_ARCH[arch] + ) + arch_def = modeldef.MODEL_ARCH[arch] + else: + assert arch_def is not None and len(arch_def) > 0 + arch_def = mbuilder.unify_arch_def(arch_def) + + rpn_stride = arch_def.get("rpn_stride", None) + if rpn_stride is not None: + assert ( + cfg.MODEL.RPN.ANCHOR_STRIDE[0] == rpn_stride + ), "Needs to set cfg.MODEL.RPN.ANCHOR_STRIDE to {}, got {}".format( + rpn_stride, cfg.MODEL.RPN.ANCHOR_STRIDE + ) + width_divisor = cfg.MODEL.FBNET.WIDTH_DIVISOR + dw_skip_bn = cfg.MODEL.FBNET.DW_CONV_SKIP_BN + dw_skip_relu = cfg.MODEL.FBNET.DW_CONV_SKIP_RELU + + logger.info( + "Building fbnet model with arch {} (without scaling):\n{}".format( + arch, arch_def + ) + ) + + builder = mbuilder.FBNetBuilder( + width_ratio=factor, + bn_type=bn_type, + width_divisor=width_divisor, + dw_skip_bn=dw_skip_bn, + dw_skip_relu=dw_skip_relu, + ) + + return builder, arch_def + + +def _get_trunk_cfg(arch_def): + """ Get all stages except the last one """ + num_stages = mbuilder.get_num_stages(arch_def) + trunk_stages = arch_def.get("backbone", range(num_stages - 1)) + ret = mbuilder.get_blocks(arch_def, stage_indices=trunk_stages) + return ret + + +class FBNetTrunk(nn.Module): + def __init__( + self, builder, arch_def, dim_in, + ): + super(FBNetTrunk, self).__init__() + self.first = builder.add_first(arch_def["first"], dim_in=dim_in) + trunk_cfg = _get_trunk_cfg(arch_def) + self.stages = builder.add_blocks(trunk_cfg["stages"]) + + # return features for each stage + def forward(self, x): + y = self.first(x) + y = self.stages(y) + ret = [y] + return ret + + +@registry.BACKBONES.register("FBNet") +def add_conv_body(cfg, dim_in=3): + builder, arch_def = create_builder(cfg) + + body = FBNetTrunk(builder, arch_def, dim_in) + model = nn.Sequential(OrderedDict([("body", body)])) + model.out_channels = builder.last_depth + + return model + + +def _get_rpn_stage(arch_def, num_blocks): + rpn_stage = arch_def.get("rpn") + ret = mbuilder.get_blocks(arch_def, stage_indices=rpn_stage) + if num_blocks > 0: + logger.warn('Use last {} blocks in {} as rpn'.format(num_blocks, ret)) + block_count = len(ret["stages"]) + assert num_blocks <= block_count, "use block {}, block count {}".format( + num_blocks, block_count + ) + blocks = range(block_count - num_blocks, block_count) + ret = mbuilder.get_blocks(ret, block_indices=blocks) + return ret["stages"] + + +class FBNetRPNHead(nn.Module): + def __init__( + self, cfg, in_channels, builder, arch_def, + ): + super(FBNetRPNHead, self).__init__() + assert in_channels == builder.last_depth + + rpn_bn_type = cfg.MODEL.FBNET.RPN_BN_TYPE + if len(rpn_bn_type) > 0: + builder.bn_type = rpn_bn_type + + use_blocks = cfg.MODEL.FBNET.RPN_HEAD_BLOCKS + stages = _get_rpn_stage(arch_def, use_blocks) + + self.head = 
builder.add_blocks(stages) + self.out_channels = builder.last_depth + + def forward(self, x): + x = [self.head(y) for y in x] + return x + + +@registry.RPN_HEADS.register("FBNet.rpn_head") +def add_rpn_head(cfg, in_channels, num_anchors): + builder, model_arch = create_builder(cfg) + builder.last_depth = in_channels + + assert in_channels == builder.last_depth + # builder.name_prefix = "[rpn]" + + rpn_feature = FBNetRPNHead(cfg, in_channels, builder, model_arch) + rpn_regressor = rpn.RPNHeadConvRegressor( + cfg, rpn_feature.out_channels, num_anchors) + return nn.Sequential(rpn_feature, rpn_regressor) + + +def _get_head_stage(arch, head_name, blocks): + # use default name 'head' if the specific name 'head_name' does not existed + if head_name not in arch: + head_name = "head" + head_stage = arch.get(head_name) + ret = mbuilder.get_blocks(arch, stage_indices=head_stage, block_indices=blocks) + return ret["stages"] + + +# name mapping for head names in arch def and cfg +ARCH_CFG_NAME_MAPPING = { + "bbox": "ROI_BOX_HEAD", + "kpts": "ROI_KEYPOINT_HEAD", + "mask": "ROI_MASK_HEAD", +} + + +class FBNetROIHead(nn.Module): + def __init__( + self, cfg, in_channels, builder, arch_def, + head_name, use_blocks, stride_init, last_layer_scale, + ): + super(FBNetROIHead, self).__init__() + assert in_channels == builder.last_depth + assert isinstance(use_blocks, list) + + head_cfg_name = ARCH_CFG_NAME_MAPPING[head_name] + self.pooler = poolers.make_pooler(cfg, head_cfg_name) + + stage = _get_head_stage(arch_def, head_name, use_blocks) + + assert stride_init in [0, 1, 2] + if stride_init != 0: + stage[0]["block"][3] = stride_init + blocks = builder.add_blocks(stage) + + last_info = copy.deepcopy(arch_def["last"]) + last_info[1] = last_layer_scale + last = builder.add_last(last_info) + + self.head = nn.Sequential(OrderedDict([ + ("blocks", blocks), + ("last", last) + ])) + + self.out_channels = builder.last_depth + + def forward(self, x, proposals): + x = self.pooler(x, proposals) + x = self.head(x) + return x + + +@registry.ROI_BOX_FEATURE_EXTRACTORS.register("FBNet.roi_head") +def add_roi_head(cfg, in_channels): + builder, model_arch = create_builder(cfg) + builder.last_depth = in_channels + # builder.name_prefix = "_[bbox]_" + + return FBNetROIHead( + cfg, in_channels, builder, model_arch, + head_name="bbox", + use_blocks=cfg.MODEL.FBNET.DET_HEAD_BLOCKS, + stride_init=cfg.MODEL.FBNET.DET_HEAD_STRIDE, + last_layer_scale=cfg.MODEL.FBNET.DET_HEAD_LAST_SCALE, + ) + + +@registry.ROI_KEYPOINT_FEATURE_EXTRACTORS.register("FBNet.roi_head_keypoints") +def add_roi_head_keypoints(cfg, in_channels): + builder, model_arch = create_builder(cfg) + builder.last_depth = in_channels + # builder.name_prefix = "_[kpts]_" + + return FBNetROIHead( + cfg, in_channels, builder, model_arch, + head_name="kpts", + use_blocks=cfg.MODEL.FBNET.KPTS_HEAD_BLOCKS, + stride_init=cfg.MODEL.FBNET.KPTS_HEAD_STRIDE, + last_layer_scale=cfg.MODEL.FBNET.KPTS_HEAD_LAST_SCALE, + ) + + +@registry.ROI_MASK_FEATURE_EXTRACTORS.register("FBNet.roi_head_mask") +def add_roi_head_mask(cfg, in_channels): + builder, model_arch = create_builder(cfg) + builder.last_depth = in_channels + # builder.name_prefix = "_[mask]_" + + return FBNetROIHead( + cfg, in_channels, builder, model_arch, + head_name="mask", + use_blocks=cfg.MODEL.FBNET.MASK_HEAD_BLOCKS, + stride_init=cfg.MODEL.FBNET.MASK_HEAD_STRIDE, + last_layer_scale=cfg.MODEL.FBNET.MASK_HEAD_LAST_SCALE, + ) diff --git a/maskrcnn_benchmark/modeling/backbone/fbnet_builder.py 
b/maskrcnn_benchmark/modeling/backbone/fbnet_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..112a04074c31307d9080e0bf61115f79d4a9e0d4 --- /dev/null +++ b/maskrcnn_benchmark/modeling/backbone/fbnet_builder.py @@ -0,0 +1,829 @@ +""" +FBNet model builder +""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import copy +import logging +import math +from collections import OrderedDict + +import torch +import torch.nn as nn +from maskrcnn_benchmark.layers import ( + BatchNorm2d, + Conv2d, + FrozenBatchNorm2d, + interpolate, +) +from maskrcnn_benchmark.layers.misc import _NewEmptyTensorOp + + +logger = logging.getLogger(__name__) + + +def _py2_round(x): + return math.floor(x + 0.5) if x >= 0.0 else math.ceil(x - 0.5) + + +def _get_divisible_by(num, divisible_by, min_val): + ret = int(num) + if divisible_by > 0 and num % divisible_by != 0: + ret = int((_py2_round(num / divisible_by) or min_val) * divisible_by) + return ret + + +PRIMITIVES = { + "skip": lambda C_in, C_out, expansion, stride, **kwargs: Identity( + C_in, C_out, stride + ), + "ir_k3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, expansion, stride, **kwargs + ), + "ir_k5": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, expansion, stride, kernel=5, **kwargs + ), + "ir_k7": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, expansion, stride, kernel=7, **kwargs + ), + "ir_k1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, expansion, stride, kernel=1, **kwargs + ), + "shuffle": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, expansion, stride, shuffle_type="mid", pw_group=4, **kwargs + ), + "basic_block": lambda C_in, C_out, expansion, stride, **kwargs: CascadeConv3x3( + C_in, C_out, stride + ), + "shift_5x5": lambda C_in, C_out, expansion, stride, **kwargs: ShiftBlock5x5( + C_in, C_out, expansion, stride + ), + # layer search 2 + "ir_k3_e1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 1, stride, kernel=3, **kwargs + ), + "ir_k3_e3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 3, stride, kernel=3, **kwargs + ), + "ir_k3_e6": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 6, stride, kernel=3, **kwargs + ), + "ir_k3_s4": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 4, stride, kernel=3, shuffle_type="mid", pw_group=4, **kwargs + ), + "ir_k5_e1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 1, stride, kernel=5, **kwargs + ), + "ir_k5_e3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 3, stride, kernel=5, **kwargs + ), + "ir_k5_e6": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 6, stride, kernel=5, **kwargs + ), + "ir_k5_s4": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 4, stride, kernel=5, shuffle_type="mid", pw_group=4, **kwargs + ), + # layer search se + "ir_k3_e1_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 1, stride, kernel=3, se=True, **kwargs + ), + "ir_k3_e3_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 3, stride, kernel=3, se=True, **kwargs + ), + "ir_k3_e6_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 6, stride, kernel=3, se=True, **kwargs + ), + "ir_k3_s4_se": lambda C_in, C_out, expansion, stride, 
**kwargs: IRFBlock( + C_in, + C_out, + 4, + stride, + kernel=3, + shuffle_type="mid", + pw_group=4, + se=True, + **kwargs + ), + "ir_k5_e1_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 1, stride, kernel=5, se=True, **kwargs + ), + "ir_k5_e3_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 3, stride, kernel=5, se=True, **kwargs + ), + "ir_k5_e6_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 6, stride, kernel=5, se=True, **kwargs + ), + "ir_k5_s4_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, + C_out, + 4, + stride, + kernel=5, + shuffle_type="mid", + pw_group=4, + se=True, + **kwargs + ), + # layer search 3 (in addition to layer search 2) + "ir_k3_s2": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 1, stride, kernel=3, shuffle_type="mid", pw_group=2, **kwargs + ), + "ir_k5_s2": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 1, stride, kernel=5, shuffle_type="mid", pw_group=2, **kwargs + ), + "ir_k3_s2_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, + C_out, + 1, + stride, + kernel=3, + shuffle_type="mid", + pw_group=2, + se=True, + **kwargs + ), + "ir_k5_s2_se": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, + C_out, + 1, + stride, + kernel=5, + shuffle_type="mid", + pw_group=2, + se=True, + **kwargs + ), + # layer search 4 (in addition to layer search 3) + "ir_k3_sep": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, expansion, stride, kernel=3, cdw=True, **kwargs + ), + "ir_k33_e1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 1, stride, kernel=3, cdw=True, **kwargs + ), + "ir_k33_e3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 3, stride, kernel=3, cdw=True, **kwargs + ), + "ir_k33_e6": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 6, stride, kernel=3, cdw=True, **kwargs + ), + # layer search 5 (in addition to layer search 4) + "ir_k7_e1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 1, stride, kernel=7, **kwargs + ), + "ir_k7_e3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 3, stride, kernel=7, **kwargs + ), + "ir_k7_e6": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 6, stride, kernel=7, **kwargs + ), + "ir_k7_sep": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, expansion, stride, kernel=7, cdw=True, **kwargs + ), + "ir_k7_sep_e1": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 1, stride, kernel=7, cdw=True, **kwargs + ), + "ir_k7_sep_e3": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 3, stride, kernel=7, cdw=True, **kwargs + ), + "ir_k7_sep_e6": lambda C_in, C_out, expansion, stride, **kwargs: IRFBlock( + C_in, C_out, 6, stride, kernel=7, cdw=True, **kwargs + ), +} + + +class Identity(nn.Module): + def __init__(self, C_in, C_out, stride): + super(Identity, self).__init__() + self.conv = ( + ConvBNRelu( + C_in, + C_out, + kernel=1, + stride=stride, + pad=0, + no_bias=1, + use_relu="relu", + bn_type="bn", + ) + if C_in != C_out or stride != 1 + else None + ) + + def forward(self, x): + if self.conv: + out = self.conv(x) + else: + out = x + return out + + +class CascadeConv3x3(nn.Sequential): + def __init__(self, C_in, C_out, stride): + assert stride in [1, 2] + ops = [ + Conv2d(C_in, C_in, 3, 
stride, 1, bias=False), + BatchNorm2d(C_in), + nn.ReLU(inplace=True), + Conv2d(C_in, C_out, 3, 1, 1, bias=False), + BatchNorm2d(C_out), + ] + super(CascadeConv3x3, self).__init__(*ops) + self.res_connect = (stride == 1) and (C_in == C_out) + + def forward(self, x): + y = super(CascadeConv3x3, self).forward(x) + if self.res_connect: + y += x + return y + + +class Shift(nn.Module): + def __init__(self, C, kernel_size, stride, padding): + super(Shift, self).__init__() + self.C = C + kernel = torch.zeros((C, 1, kernel_size, kernel_size), dtype=torch.float32) + ch_idx = 0 + + assert stride in [1, 2] + self.stride = stride + self.padding = padding + self.kernel_size = kernel_size + self.dilation = 1 + + hks = kernel_size // 2 + ksq = kernel_size ** 2 + + for i in range(kernel_size): + for j in range(kernel_size): + if i == hks and j == hks: + num_ch = C // ksq + C % ksq + else: + num_ch = C // ksq + kernel[ch_idx : ch_idx + num_ch, 0, i, j] = 1 + ch_idx += num_ch + + self.register_parameter("bias", None) + self.kernel = nn.Parameter(kernel, requires_grad=False) + + def forward(self, x): + if x.numel() > 0: + return nn.functional.conv2d( + x, + self.kernel, + self.bias, + (self.stride, self.stride), + (self.padding, self.padding), + self.dilation, + self.C, # groups + ) + + output_shape = [ + (i + 2 * p - (di * (k - 1) + 1)) // d + 1 + for i, p, di, k, d in zip( + x.shape[-2:], + (self.padding, self.dilation), + (self.dilation, self.dilation), + (self.kernel_size, self.kernel_size), + (self.stride, self.stride), + ) + ] + output_shape = [x.shape[0], self.C] + output_shape + return _NewEmptyTensorOp.apply(x, output_shape) + + +class ShiftBlock5x5(nn.Sequential): + def __init__(self, C_in, C_out, expansion, stride): + assert stride in [1, 2] + self.res_connect = (stride == 1) and (C_in == C_out) + + C_mid = _get_divisible_by(C_in * expansion, 8, 8) + + ops = [ + # pw + Conv2d(C_in, C_mid, 1, 1, 0, bias=False), + BatchNorm2d(C_mid), + nn.ReLU(inplace=True), + # shift + Shift(C_mid, 5, stride, 2), + # pw-linear + Conv2d(C_mid, C_out, 1, 1, 0, bias=False), + BatchNorm2d(C_out), + ] + super(ShiftBlock5x5, self).__init__(*ops) + + def forward(self, x): + y = super(ShiftBlock5x5, self).forward(x) + if self.res_connect: + y += x + return y + + +class ChannelShuffle(nn.Module): + def __init__(self, groups): + super(ChannelShuffle, self).__init__() + self.groups = groups + + def forward(self, x): + """Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]""" + N, C, H, W = x.size() + g = self.groups + assert C % g == 0, "Incompatible group size {} for input channel {}".format( + g, C + ) + return ( + x.view(N, g, int(C / g), H, W) + .permute(0, 2, 1, 3, 4) + .contiguous() + .view(N, C, H, W) + ) + + +class ConvBNRelu(nn.Sequential): + def __init__( + self, + input_depth, + output_depth, + kernel, + stride, + pad, + no_bias, + use_relu, + bn_type, + group=1, + *args, + **kwargs + ): + super(ConvBNRelu, self).__init__() + + assert use_relu in ["relu", None] + if isinstance(bn_type, (list, tuple)): + assert len(bn_type) == 2 + assert bn_type[0] == "gn" + gn_group = bn_type[1] + bn_type = bn_type[0] + assert bn_type in ["bn", "af", "gn", None] + assert stride in [1, 2, 4] + + op = Conv2d( + input_depth, + output_depth, + kernel_size=kernel, + stride=stride, + padding=pad, + bias=not no_bias, + groups=group, + *args, + **kwargs + ) + nn.init.kaiming_normal_(op.weight, mode="fan_out", nonlinearity="relu") + if op.bias is not None: + nn.init.constant_(op.bias, 0.0) + self.add_module("conv", op) 
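+ # normalization choice below: "bn" -> BatchNorm2d, "gn" -> GroupNorm with gn_group groups, "af" -> FrozenBatchNorm2d (frozen statistics); bn_type=None skips the norm layer entirely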
+ + if bn_type == "bn": + bn_op = BatchNorm2d(output_depth) + elif bn_type == "gn": + bn_op = nn.GroupNorm(num_groups=gn_group, num_channels=output_depth) + elif bn_type == "af": + bn_op = FrozenBatchNorm2d(output_depth) + if bn_type is not None: + self.add_module("bn", bn_op) + + if use_relu == "relu": + self.add_module("relu", nn.ReLU(inplace=True)) + + +class SEModule(nn.Module): + reduction = 4 + + def __init__(self, C): + super(SEModule, self).__init__() + mid = max(C // self.reduction, 8) + conv1 = Conv2d(C, mid, 1, 1, 0) + conv2 = Conv2d(mid, C, 1, 1, 0) + + self.op = nn.Sequential( + nn.AdaptiveAvgPool2d(1), conv1, nn.ReLU(inplace=True), conv2, nn.Sigmoid() + ) + + def forward(self, x): + return x * self.op(x) + + +class Upsample(nn.Module): + def __init__(self, scale_factor, mode, align_corners=None): + super(Upsample, self).__init__() + self.scale = scale_factor + self.mode = mode + self.align_corners = align_corners + + def forward(self, x): + return interpolate( + x, scale_factor=self.scale, mode=self.mode, + align_corners=self.align_corners + ) + + +def _get_upsample_op(stride): + assert ( + stride in [1, 2, 4] + or stride in [-1, -2, -4] + or (isinstance(stride, tuple) and all(x in [-1, -2, -4] for x in stride)) + ) + + scales = stride + ret = None + if isinstance(stride, tuple) or stride < 0: + scales = [-x for x in stride] if isinstance(stride, tuple) else -stride + stride = 1 + ret = Upsample(scale_factor=scales, mode="nearest", align_corners=None) + + return ret, stride + + +class IRFBlock(nn.Module): + def __init__( + self, + input_depth, + output_depth, + expansion, + stride, + bn_type="bn", + kernel=3, + width_divisor=1, + shuffle_type=None, + pw_group=1, + se=False, + cdw=False, + dw_skip_bn=False, + dw_skip_relu=False, + ): + super(IRFBlock, self).__init__() + + assert kernel in [1, 3, 5, 7], kernel + + self.use_res_connect = stride == 1 and input_depth == output_depth + self.output_depth = output_depth + + mid_depth = int(input_depth * expansion) + mid_depth = _get_divisible_by(mid_depth, width_divisor, width_divisor) + + # pw + self.pw = ConvBNRelu( + input_depth, + mid_depth, + kernel=1, + stride=1, + pad=0, + no_bias=1, + use_relu="relu", + bn_type=bn_type, + group=pw_group, + ) + + # negative stride to do upsampling + self.upscale, stride = _get_upsample_op(stride) + + # dw + if kernel == 1: + self.dw = nn.Sequential() + elif cdw: + dw1 = ConvBNRelu( + mid_depth, + mid_depth, + kernel=kernel, + stride=stride, + pad=(kernel // 2), + group=mid_depth, + no_bias=1, + use_relu="relu", + bn_type=bn_type, + ) + dw2 = ConvBNRelu( + mid_depth, + mid_depth, + kernel=kernel, + stride=1, + pad=(kernel // 2), + group=mid_depth, + no_bias=1, + use_relu="relu" if not dw_skip_relu else None, + bn_type=bn_type if not dw_skip_bn else None, + ) + self.dw = nn.Sequential(OrderedDict([("dw1", dw1), ("dw2", dw2)])) + else: + self.dw = ConvBNRelu( + mid_depth, + mid_depth, + kernel=kernel, + stride=stride, + pad=(kernel // 2), + group=mid_depth, + no_bias=1, + use_relu="relu" if not dw_skip_relu else None, + bn_type=bn_type if not dw_skip_bn else None, + ) + + # pw-linear + self.pwl = ConvBNRelu( + mid_depth, + output_depth, + kernel=1, + stride=1, + pad=0, + no_bias=1, + use_relu=None, + bn_type=bn_type, + group=pw_group, + ) + + self.shuffle_type = shuffle_type + if shuffle_type is not None: + self.shuffle = ChannelShuffle(pw_group) + + self.se4 = SEModule(output_depth) if se else nn.Sequential() + + self.output_depth = output_depth + + def forward(self, x): + y = self.pw(x) + if 
self.shuffle_type == "mid": + y = self.shuffle(y) + if self.upscale is not None: + y = self.upscale(y) + y = self.dw(y) + y = self.pwl(y) + if self.use_res_connect: + y += x + y = self.se4(y) + return y + + +def _expand_block_cfg(block_cfg): + assert isinstance(block_cfg, list) + ret = [] + for idx in range(block_cfg[2]): + cur = copy.deepcopy(block_cfg) + cur[2] = 1 + cur[3] = 1 if idx >= 1 else cur[3] + ret.append(cur) + return ret + + +def expand_stage_cfg(stage_cfg): + """ For a single stage """ + assert isinstance(stage_cfg, list) + ret = [] + for x in stage_cfg: + ret += _expand_block_cfg(x) + return ret + + +def expand_stages_cfg(stage_cfgs): + """ For a list of stages """ + assert isinstance(stage_cfgs, list) + ret = [] + for x in stage_cfgs: + ret.append(expand_stage_cfg(x)) + return ret + + +def _block_cfgs_to_list(block_cfgs): + assert isinstance(block_cfgs, list) + ret = [] + for stage_idx, stage in enumerate(block_cfgs): + stage = expand_stage_cfg(stage) + for block_idx, block in enumerate(stage): + cur = {"stage_idx": stage_idx, "block_idx": block_idx, "block": block} + ret.append(cur) + return ret + + +def _add_to_arch(arch, info, name): + """ arch = [{block_0}, {block_1}, ...] + info = [ + # stage 0 + [ + block0_info, + block1_info, + ... + ], ... + ] + convert to: + arch = [ + { + block_0, + name: block0_info, + }, + { + block_1, + name: block1_info, + }, ... + ] + """ + assert isinstance(arch, list) and all(isinstance(x, dict) for x in arch) + assert isinstance(info, list) and all(isinstance(x, list) for x in info) + idx = 0 + for stage_idx, stage in enumerate(info): + for block_idx, block in enumerate(stage): + assert ( + arch[idx]["stage_idx"] == stage_idx + and arch[idx]["block_idx"] == block_idx + ), "Index ({}, {}) does not match for block {}".format( + stage_idx, block_idx, arch[idx] + ) + assert name not in arch[idx] + arch[idx][name] = block + idx += 1 + + +def unify_arch_def(arch_def): + """ unify the arch_def to: + { + ..., + "arch": [ + { + "stage_idx": idx, + "block_idx": idx, + ... + }, + {}, ... + ] + } + """ + ret = copy.deepcopy(arch_def) + + assert "block_cfg" in arch_def and "stages" in arch_def["block_cfg"] + assert "stages" not in ret + # copy 'first', 'last' etc. 
inside arch_def['block_cfg'] to ret + ret.update({x: arch_def["block_cfg"][x] for x in arch_def["block_cfg"]}) + ret["stages"] = _block_cfgs_to_list(arch_def["block_cfg"]["stages"]) + del ret["block_cfg"] + + assert "block_op_type" in arch_def + _add_to_arch(ret["stages"], arch_def["block_op_type"], "block_op_type") + del ret["block_op_type"] + + return ret + + +def get_num_stages(arch_def): + ret = 0 + for x in arch_def["stages"]: + ret = max(x["stage_idx"], ret) + ret = ret + 1 + return ret + + +def get_blocks(arch_def, stage_indices=None, block_indices=None): + ret = copy.deepcopy(arch_def) + ret["stages"] = [] + for block in arch_def["stages"]: + keep = True + if stage_indices not in (None, []) and block["stage_idx"] not in stage_indices: + keep = False + if block_indices not in (None, []) and block["block_idx"] not in block_indices: + keep = False + if keep: + ret["stages"].append(block) + return ret + + +class FBNetBuilder(object): + def __init__( + self, + width_ratio, + bn_type="bn", + width_divisor=1, + dw_skip_bn=False, + dw_skip_relu=False, + ): + self.width_ratio = width_ratio + self.last_depth = -1 + self.bn_type = bn_type + self.width_divisor = width_divisor + self.dw_skip_bn = dw_skip_bn + self.dw_skip_relu = dw_skip_relu + + def add_first(self, stage_info, dim_in=3, pad=True): + # stage_info: [c, s, kernel] + assert len(stage_info) >= 2 + channel = stage_info[0] + stride = stage_info[1] + out_depth = self._get_divisible_width(int(channel * self.width_ratio)) + kernel = 3 + if len(stage_info) > 2: + kernel = stage_info[2] + + out = ConvBNRelu( + dim_in, + out_depth, + kernel=kernel, + stride=stride, + pad=kernel // 2 if pad else 0, + no_bias=1, + use_relu="relu", + bn_type=self.bn_type, + ) + self.last_depth = out_depth + return out + + def add_blocks(self, blocks): + """ blocks: [{}, {}, ...] 
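+ Each dict is expected to provide "stage_idx", "block_idx", "block_op_type" and a "block" entry [t, c, n, s] with n == 1 (stages are expanded beforehand by unify_arch_def).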
+ """ + assert isinstance(blocks, list) and all( + isinstance(x, dict) for x in blocks + ), blocks + + modules = OrderedDict() + for block in blocks: + stage_idx = block["stage_idx"] + block_idx = block["block_idx"] + block_op_type = block["block_op_type"] + tcns = block["block"] + n = tcns[2] + assert n == 1 + nnblock = self.add_ir_block(tcns, [block_op_type]) + nn_name = "xif{}_{}".format(stage_idx, block_idx) + assert nn_name not in modules + modules[nn_name] = nnblock + ret = nn.Sequential(modules) + return ret + + def add_last(self, stage_info): + """ skip last layer if channel_scale == 0 + use the same output channel if channel_scale < 0 + """ + assert len(stage_info) == 2 + channels = stage_info[0] + channel_scale = stage_info[1] + + if channel_scale == 0.0: + return nn.Sequential() + + if channel_scale > 0: + last_channel = ( + int(channels * self.width_ratio) if self.width_ratio > 1.0 else channels + ) + last_channel = int(last_channel * channel_scale) + else: + last_channel = int(self.last_depth * (-channel_scale)) + last_channel = self._get_divisible_width(last_channel) + + if last_channel == 0: + return nn.Sequential() + + dim_in = self.last_depth + ret = ConvBNRelu( + dim_in, + last_channel, + kernel=1, + stride=1, + pad=0, + no_bias=1, + use_relu="relu", + bn_type=self.bn_type, + ) + self.last_depth = last_channel + return ret + + # def add_final_pool(self, model, blob_in, kernel_size): + # ret = model.AveragePool(blob_in, "final_avg", kernel=kernel_size, stride=1) + # return ret + + def _add_ir_block( + self, dim_in, dim_out, stride, expand_ratio, block_op_type, **kwargs + ): + ret = PRIMITIVES[block_op_type]( + dim_in, + dim_out, + expansion=expand_ratio, + stride=stride, + bn_type=self.bn_type, + width_divisor=self.width_divisor, + dw_skip_bn=self.dw_skip_bn, + dw_skip_relu=self.dw_skip_relu, + **kwargs + ) + return ret, ret.output_depth + + def add_ir_block(self, tcns, block_op_types, **kwargs): + t, c, n, s = tcns + assert n == 1 + out_depth = self._get_divisible_width(int(c * self.width_ratio)) + dim_in = self.last_depth + op, ret_depth = self._add_ir_block( + dim_in, + out_depth, + stride=s, + expand_ratio=t, + block_op_type=block_op_types[0], + **kwargs + ) + self.last_depth = ret_depth + return op + + def _get_divisible_width(self, width): + ret = _get_divisible_by(int(width), self.width_divisor, self.width_divisor) + return ret diff --git a/maskrcnn_benchmark/modeling/backbone/fbnet_modeldef.py b/maskrcnn_benchmark/modeling/backbone/fbnet_modeldef.py new file mode 100644 index 0000000000000000000000000000000000000000..fb1c96b3a4dbe735682ae81361ee0efed75cbb25 --- /dev/null +++ b/maskrcnn_benchmark/modeling/backbone/fbnet_modeldef.py @@ -0,0 +1,218 @@ +from __future__ import absolute_import, division, print_function, unicode_literals + + +def add_archs(archs): + global MODEL_ARCH + for x in archs: + assert x not in MODEL_ARCH, "Duplicated model name {} existed".format(x) + MODEL_ARCH[x] = archs[x] + + +MODEL_ARCH = { + "default": { + "block_op_type": [ + # stage 0 + ["ir_k3"], + # stage 1 + ["ir_k3"] * 2, + # stage 2 + ["ir_k3"] * 3, + # stage 3 + ["ir_k3"] * 7, + # stage 4, bbox head + ["ir_k3"] * 4, + # stage 5, rpn + ["ir_k3"] * 3, + # stage 5, mask head + ["ir_k3"] * 5, + ], + "block_cfg": { + "first": [32, 2], + "stages": [ + # [t, c, n, s] + # stage 0 + [[1, 16, 1, 1]], + # stage 1 + [[6, 24, 2, 2]], + # stage 2 + [[6, 32, 3, 2]], + # stage 3 + [[6, 64, 4, 2], [6, 96, 3, 1]], + # stage 4, bbox head + [[4, 160, 1, 2], [6, 160, 2, 1], [6, 240, 1, 1]], + # [[6, 
160, 3, 2], [6, 320, 1, 1]], + # stage 5, rpn head + [[6, 96, 3, 1]], + # stage 6, mask head + [[4, 160, 1, 1], [6, 160, 3, 1], [3, 80, 1, -2]], + ], + # [c, channel_scale] + "last": [0, 0.0], + "backbone": [0, 1, 2, 3], + "rpn": [5], + "bbox": [4], + "mask": [6], + }, + }, + "xirb16d_dsmask": { + "block_op_type": [ + # stage 0 + ["ir_k3"], + # stage 1 + ["ir_k3"] * 2, + # stage 2 + ["ir_k3"] * 3, + # stage 3 + ["ir_k3"] * 7, + # stage 4, bbox head + ["ir_k3"] * 4, + # stage 5, mask head + ["ir_k3"] * 5, + # stage 6, rpn + ["ir_k3"] * 3, + ], + "block_cfg": { + "first": [16, 2], + "stages": [ + # [t, c, n, s] + # stage 0 + [[1, 16, 1, 1]], + # stage 1 + [[6, 32, 2, 2]], + # stage 2 + [[6, 48, 3, 2]], + # stage 3 + [[6, 96, 4, 2], [6, 128, 3, 1]], + # stage 4, bbox head + [[4, 128, 1, 2], [6, 128, 2, 1], [6, 160, 1, 1]], + # stage 5, mask head + [[4, 128, 1, 2], [6, 128, 2, 1], [6, 128, 1, -2], [3, 64, 1, -2]], + # stage 6, rpn head + [[6, 128, 3, 1]], + ], + # [c, channel_scale] + "last": [0, 0.0], + "backbone": [0, 1, 2, 3], + "rpn": [6], + "bbox": [4], + "mask": [5], + }, + }, + "mobilenet_v2": { + "block_op_type": [ + # stage 0 + ["ir_k3"], + # stage 1 + ["ir_k3"] * 2, + # stage 2 + ["ir_k3"] * 3, + # stage 3 + ["ir_k3"] * 7, + # stage 4 + ["ir_k3"] * 4, + ], + "block_cfg": { + "first": [32, 2], + "stages": [ + # [t, c, n, s] + # stage 0 + [[1, 16, 1, 1]], + # stage 1 + [[6, 24, 2, 2]], + # stage 2 + [[6, 32, 3, 2]], + # stage 3 + [[6, 64, 4, 2], [6, 96, 3, 1]], + # stage 4 + [[6, 160, 3, 1], [6, 320, 1, 1]], + ], + # [c, channel_scale] + "last": [0, 0.0], + "backbone": [0, 1, 2, 3], + "bbox": [4], + }, + }, +} + + +MODEL_ARCH_CHAM = { + "cham_v1a": { + "block_op_type": [ + # stage 0 + ["ir_k3"], + # stage 1 + ["ir_k7"] * 2, + # stage 2 + ["ir_k3"] * 5, + # stage 3 + ["ir_k5"] * 7 + ["ir_k3"] * 5, + # stage 4, bbox head + ["ir_k3"] * 5, + # stage 5, rpn + ["ir_k3"] * 3, + ], + "block_cfg": { + "first": [32, 2], + "stages": [ + # [t, c, n, s] + # stage 0 + [[1, 24, 1, 1]], + # stage 1 + [[4, 48, 2, 2]], + # stage 2 + [[7, 64, 5, 2]], + # stage 3 + [[12, 56, 7, 2], [8, 88, 5, 1]], + # stage 4, bbox head + [[7, 152, 4, 2], [10, 104, 1, 1]], + # stage 5, rpn head + [[8, 88, 3, 1]], + ], + # [c, channel_scale] + "last": [0, 0.0], + "backbone": [0, 1, 2, 3], + "rpn": [5], + "bbox": [4], + }, + }, + "cham_v2": { + "block_op_type": [ + # stage 0 + ["ir_k3"], + # stage 1 + ["ir_k5"] * 4, + # stage 2 + ["ir_k7"] * 6, + # stage 3 + ["ir_k5"] * 3 + ["ir_k3"] * 6, + # stage 4, bbox head + ["ir_k3"] * 7, + # stage 5, rpn + ["ir_k3"] * 1, + ], + "block_cfg": { + "first": [32, 2], + "stages": [ + # [t, c, n, s] + # stage 0 + [[1, 24, 1, 1]], + # stage 1 + [[8, 32, 4, 2]], + # stage 2 + [[5, 48, 6, 2]], + # stage 3 + [[9, 56, 3, 2], [6, 56, 6, 1]], + # stage 4, bbox head + [[2, 160, 6, 2], [6, 112, 1, 1]], + # stage 5, rpn head + [[6, 56, 1, 1]], + ], + # [c, channel_scale] + "last": [0, 0.0], + "backbone": [0, 1, 2, 3], + "rpn": [5], + "bbox": [4], + }, + }, +} +add_archs(MODEL_ARCH_CHAM) diff --git a/maskrcnn_benchmark/modeling/backbone/fpn.py b/maskrcnn_benchmark/modeling/backbone/fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..edf4d04b7012ce68a5db816df11ffae1b70cb83a --- /dev/null +++ b/maskrcnn_benchmark/modeling/backbone/fpn.py @@ -0,0 +1,99 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
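+# Feature Pyramid Network (FPN) over the backbone feature maps, plus the optional extra top blocks (LastLevelMaxPool for the standard FPN, LastLevelP6P7 for RetinaNet).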
+import torch +import torch.nn.functional as F +from torch import nn + + +class FPN(nn.Module): + """ + Module that adds FPN on top of a list of feature maps. + The feature maps are currently supposed to be in increasing depth + order, and must be consecutive + """ + + def __init__( + self, in_channels_list, out_channels, conv_block, top_blocks=None + ): + """ + Arguments: + in_channels_list (list[int]): number of channels for each feature map that + will be fed + out_channels (int): number of channels of the FPN representation + top_blocks (nn.Module or None): if provided, an extra operation will + be performed on the output of the last (smallest resolution) + FPN output, and the result will extend the result list + """ + super(FPN, self).__init__() + self.inner_blocks = [] + self.layer_blocks = [] + for idx, in_channels in enumerate(in_channels_list, 1): + inner_block = "fpn_inner{}".format(idx) + layer_block = "fpn_layer{}".format(idx) + + if in_channels == 0: + continue + inner_block_module = conv_block(in_channels, out_channels, 1) + layer_block_module = conv_block(out_channels, out_channels, 3, 1) + self.add_module(inner_block, inner_block_module) + self.add_module(layer_block, layer_block_module) + self.inner_blocks.append(inner_block) + self.layer_blocks.append(layer_block) + self.top_blocks = top_blocks + + def forward(self, x): + """ + Arguments: + x (list[Tensor]): feature maps for each feature level. + Returns: + results (tuple[Tensor]): feature maps after FPN layers. + They are ordered from highest resolution first. + """ + last_inner = getattr(self, self.inner_blocks[-1])(x[-1]) + results = [] + results.append(getattr(self, self.layer_blocks[-1])(last_inner)) + for feature, inner_block, layer_block in zip( + x[:-1][::-1], self.inner_blocks[:-1][::-1], self.layer_blocks[:-1][::-1] + ): + if not inner_block: + continue + inner_top_down = F.interpolate(last_inner, scale_factor=2, mode="nearest") + inner_lateral = getattr(self, inner_block)(feature) + # TODO use size instead of scale to make it robust to different sizes + # inner_top_down = F.upsample(last_inner, size=inner_lateral.shape[-2:], + # mode='bilinear', align_corners=False) + last_inner = inner_lateral + inner_top_down + results.insert(0, getattr(self, layer_block)(last_inner)) + + if isinstance(self.top_blocks, LastLevelP6P7): + last_results = self.top_blocks(x[-1], results[-1]) + results.extend(last_results) + elif isinstance(self.top_blocks, LastLevelMaxPool): + last_results = self.top_blocks(results[-1]) + results.extend(last_results) + + return tuple(results) + + +class LastLevelMaxPool(nn.Module): + def forward(self, x): + return [F.max_pool2d(x, 1, 2, 0)] + + +class LastLevelP6P7(nn.Module): + """ + This module is used in RetinaNet to generate extra layers, P6 and P7. 
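+ P6 is computed from C5 (or from P5 when in_channels == out_channels) with a stride-2 3x3 conv, and P7 from a ReLU followed by another stride-2 3x3 conv.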
+ """ + def __init__(self, in_channels, out_channels): + super(LastLevelP6P7, self).__init__() + self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1) + self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1) + for module in [self.p6, self.p7]: + nn.init.kaiming_uniform_(module.weight, a=1) + nn.init.constant_(module.bias, 0) + self.use_P5 = in_channels == out_channels + + def forward(self, c5, p5): + x = p5 if self.use_P5 else c5 + p6 = self.p6(x) + p7 = self.p7(F.relu(p6)) + return [p6, p7] diff --git a/maskrcnn_benchmark/modeling/backbone/resnet.py b/maskrcnn_benchmark/modeling/backbone/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..15d96720cbb0c44d173215894d0b1599480eb94b --- /dev/null +++ b/maskrcnn_benchmark/modeling/backbone/resnet.py @@ -0,0 +1,418 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +""" +Variant of the resnet module that takes cfg as an argument. +Example usage. Strings may be specified in the config file. + model = ResNet( + "StemWithFixedBatchNorm", + "BottleneckWithFixedBatchNorm", + "ResNet50StagesTo4", + ) +OR: + model = ResNet( + "StemWithGN", + "BottleneckWithGN", + "ResNet50StagesTo4", + ) +Custom implementations may be written in user code and hooked in via the +`register_*` functions. +""" +from collections import namedtuple + +import torch +import torch.nn.functional as F +from torch import nn + +from maskrcnn_benchmark.layers import FrozenBatchNorm2d +from maskrcnn_benchmark.layers import Conv2d +from maskrcnn_benchmark.modeling.make_layers import group_norm +from maskrcnn_benchmark.utils.registry import Registry + + +# ResNet stage specification +StageSpec = namedtuple( + "StageSpec", + [ + "index", # Index of the stage, eg 1, 2, ..,. 5 + "block_count", # Numer of residual blocks in the stage + "return_features", # True => return the last feature map from this stage + ], +) + +# ----------------------------------------------------------------------------- +# Standard ResNet models +# ----------------------------------------------------------------------------- +# ResNet-50 (including all stages) +ResNet50StagesTo5 = tuple( + StageSpec(index=i, block_count=c, return_features=r) + for (i, c, r) in ((1, 3, False), (2, 4, False), (3, 6, False), (4, 3, True)) +) +# ResNet-50 up to stage 4 (excludes stage 5) +ResNet50StagesTo4 = tuple( + StageSpec(index=i, block_count=c, return_features=r) + for (i, c, r) in ((1, 3, False), (2, 4, False), (3, 6, True)) +) +# ResNet-101 (including all stages) +ResNet101StagesTo5 = tuple( + StageSpec(index=i, block_count=c, return_features=r) + for (i, c, r) in ((1, 3, False), (2, 4, False), (3, 23, False), (4, 3, True)) +) +# ResNet-101 up to stage 4 (excludes stage 5) +ResNet101StagesTo4 = tuple( + StageSpec(index=i, block_count=c, return_features=r) + for (i, c, r) in ((1, 3, False), (2, 4, False), (3, 23, True)) +) +# ResNet-50-FPN (including all stages) +ResNet50FPNStagesTo5 = tuple( + StageSpec(index=i, block_count=c, return_features=r) + for (i, c, r) in ((1, 3, True), (2, 4, True), (3, 6, True), (4, 3, True)) +) +# ResNet-101-FPN (including all stages) +ResNet101FPNStagesTo5 = tuple( + StageSpec(index=i, block_count=c, return_features=r) + for (i, c, r) in ((1, 3, True), (2, 4, True), (3, 23, True), (4, 3, True)) +) +# ResNet-152-FPN (including all stages) +ResNet152FPNStagesTo5 = tuple( + StageSpec(index=i, block_count=c, return_features=r) + for (i, c, r) in ((1, 3, True), (2, 8, True), (3, 36, True), (4, 3, True)) +) + +class ResNet(nn.Module): + def 
__init__(self, cfg): + super(ResNet, self).__init__() + + # If we want to use the cfg in forward(), then we should make a copy + # of it and store it for later use: + # self.cfg = cfg.clone() + + # Translate string names to implementations + stem_module = _STEM_MODULES[cfg.MODEL.RESNETS.STEM_FUNC] + stage_specs = _STAGE_SPECS[cfg.MODEL.BACKBONE.CONV_BODY] + transformation_module = _TRANSFORMATION_MODULES[cfg.MODEL.RESNETS.TRANS_FUNC] + + # Construct the stem module + self.stem = stem_module(cfg) + + # Constuct the specified ResNet stages + num_groups = cfg.MODEL.RESNETS.NUM_GROUPS + width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP + in_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS + stage2_bottleneck_channels = num_groups * width_per_group + stage2_out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS + self.stages = [] + self.return_features = {} + for stage_spec in stage_specs: + name = "layer" + str(stage_spec.index) + stage2_relative_factor = 2 ** (stage_spec.index - 1) + bottleneck_channels = stage2_bottleneck_channels * stage2_relative_factor + out_channels = stage2_out_channels * stage2_relative_factor + module = _make_stage( + transformation_module, + in_channels, + bottleneck_channels, + out_channels, + stage_spec.block_count, + num_groups, + cfg.MODEL.RESNETS.STRIDE_IN_1X1, + first_stride=int(stage_spec.index > 1) + 1, + ) + in_channels = out_channels + self.add_module(name, module) + self.stages.append(name) + self.return_features[name] = stage_spec.return_features + + # Optionally freeze (requires_grad=False) parts of the backbone + self._freeze_backbone(cfg.MODEL.BACKBONE.FREEZE_CONV_BODY_AT) + + def _freeze_backbone(self, freeze_at): + if freeze_at < 0: + return + for stage_index in range(freeze_at): + if stage_index == 0: + m = self.stem # stage 0 is the stem + else: + m = getattr(self, "layer" + str(stage_index)) + for p in m.parameters(): + p.requires_grad = False + + def forward(self, x): + outputs = [] + x = self.stem(x) + for stage_name in self.stages: + x = getattr(self, stage_name)(x) + if self.return_features[stage_name]: + outputs.append(x) + return outputs + + +class ResNetHead(nn.Module): + def __init__( + self, + block_module, + stages, + num_groups=1, + width_per_group=64, + stride_in_1x1=True, + stride_init=None, + res2_out_channels=256, + dilation=1 + ): + super(ResNetHead, self).__init__() + + stage2_relative_factor = 2 ** (stages[0].index - 1) + stage2_bottleneck_channels = num_groups * width_per_group + out_channels = res2_out_channels * stage2_relative_factor + in_channels = out_channels // 2 + bottleneck_channels = stage2_bottleneck_channels * stage2_relative_factor + + block_module = _TRANSFORMATION_MODULES[block_module] + + self.stages = [] + stride = stride_init + for stage in stages: + name = "layer" + str(stage.index) + if not stride: + stride = int(stage.index > 1) + 1 + module = _make_stage( + block_module, + in_channels, + bottleneck_channels, + out_channels, + stage.block_count, + num_groups, + stride_in_1x1, + first_stride=stride, + dilation=dilation + ) + stride = None + self.add_module(name, module) + self.stages.append(name) + self.out_channels = out_channels + + def forward(self, x): + for stage in self.stages: + x = getattr(self, stage)(x) + return x + + +def _make_stage( + transformation_module, + in_channels, + bottleneck_channels, + out_channels, + block_count, + num_groups, + stride_in_1x1, + first_stride, + dilation=1 +): + blocks = [] + stride = first_stride + for _ in range(block_count): + blocks.append( + transformation_module( + 
in_channels, + bottleneck_channels, + out_channels, + num_groups, + stride_in_1x1, + stride, + dilation=dilation + ) + ) + stride = 1 + in_channels = out_channels + return nn.Sequential(*blocks) + + +class Bottleneck(nn.Module): + def __init__( + self, + in_channels, + bottleneck_channels, + out_channels, + num_groups, + stride_in_1x1, + stride, + dilation, + norm_func + ): + super(Bottleneck, self).__init__() + + self.downsample = None + if in_channels != out_channels: + down_stride = stride if dilation == 1 else 1 + self.downsample = nn.Sequential( + Conv2d( + in_channels, out_channels, + kernel_size=1, stride=down_stride, bias=False + ), + norm_func(out_channels), + ) + for modules in [self.downsample,]: + for l in modules.modules(): + if isinstance(l, Conv2d): + nn.init.kaiming_uniform_(l.weight, a=1) + + if dilation > 1: + stride = 1 # reset to be 1 + + # The original MSRA ResNet models have stride in the first 1x1 conv + # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have + # stride in the 3x3 conv + stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride) + + self.conv1 = Conv2d( + in_channels, + bottleneck_channels, + kernel_size=1, + stride=stride_1x1, + bias=False, + ) + self.bn1 = norm_func(bottleneck_channels) + # TODO: specify init for the above + + self.conv2 = Conv2d( + bottleneck_channels, + bottleneck_channels, + kernel_size=3, + stride=stride_3x3, + padding=dilation, + bias=False, + groups=num_groups, + dilation=dilation + ) + self.bn2 = norm_func(bottleneck_channels) + + self.conv3 = Conv2d( + bottleneck_channels, out_channels, kernel_size=1, bias=False + ) + self.bn3 = norm_func(out_channels) + + for l in [self.conv1, self.conv2, self.conv3,]: + nn.init.kaiming_uniform_(l.weight, a=1) + + def forward(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = F.relu_(out) + + out = self.conv2(out) + out = self.bn2(out) + out = F.relu_(out) + + out0 = self.conv3(out) + out = self.bn3(out0) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = F.relu_(out) + + return out + + +class BaseStem(nn.Module): + def __init__(self, cfg, norm_func): + super(BaseStem, self).__init__() + + out_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS + + self.conv1 = Conv2d( + 3, out_channels, kernel_size=7, stride=2, padding=3, bias=False + ) + self.bn1 = norm_func(out_channels) + + for l in [self.conv1,]: + nn.init.kaiming_uniform_(l.weight, a=1) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = F.relu_(x) + x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1) + return x + + +class BottleneckWithFixedBatchNorm(Bottleneck): + def __init__( + self, + in_channels, + bottleneck_channels, + out_channels, + num_groups=1, + stride_in_1x1=True, + stride=1, + dilation=1 + ): + super(BottleneckWithFixedBatchNorm, self).__init__( + in_channels=in_channels, + bottleneck_channels=bottleneck_channels, + out_channels=out_channels, + num_groups=num_groups, + stride_in_1x1=stride_in_1x1, + stride=stride, + dilation=dilation, + norm_func=FrozenBatchNorm2d + ) + + +class StemWithFixedBatchNorm(BaseStem): + def __init__(self, cfg): + super(StemWithFixedBatchNorm, self).__init__( + cfg, norm_func=FrozenBatchNorm2d + ) + + +class BottleneckWithGN(Bottleneck): + def __init__( + self, + in_channels, + bottleneck_channels, + out_channels, + num_groups=1, + stride_in_1x1=True, + stride=1, + dilation=1 + ): + super(BottleneckWithGN, self).__init__( + in_channels=in_channels, + 
bottleneck_channels=bottleneck_channels, + out_channels=out_channels, + num_groups=num_groups, + stride_in_1x1=stride_in_1x1, + stride=stride, + dilation=dilation, + norm_func=group_norm + ) + + +class StemWithGN(BaseStem): + def __init__(self, cfg): + super(StemWithGN, self).__init__(cfg, norm_func=group_norm) + + +_TRANSFORMATION_MODULES = Registry({ + "BottleneckWithFixedBatchNorm": BottleneckWithFixedBatchNorm, + "BottleneckWithGN": BottleneckWithGN, +}) + +_STEM_MODULES = Registry({ + "StemWithFixedBatchNorm": StemWithFixedBatchNorm, + "StemWithGN": StemWithGN, +}) + +_STAGE_SPECS = Registry({ + "R-50-C4": ResNet50StagesTo4, + "R-50-C5": ResNet50StagesTo5, + "R-101-C4": ResNet101StagesTo4, + "R-101-C5": ResNet101StagesTo5, + "R-50-FPN": ResNet50FPNStagesTo5, + "R-50-FPN-RETINANET": ResNet50FPNStagesTo5, + "R-101-FPN": ResNet101FPNStagesTo5, + "R-101-FPN-RETINANET": ResNet101FPNStagesTo5, + "R-152-FPN": ResNet152FPNStagesTo5, +}) diff --git a/maskrcnn_benchmark/modeling/balanced_positive_negative_sampler.py b/maskrcnn_benchmark/modeling/balanced_positive_negative_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..c0bd00444d3b1bdefa1a4015e8e6af72166817cf --- /dev/null +++ b/maskrcnn_benchmark/modeling/balanced_positive_negative_sampler.py @@ -0,0 +1,68 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch + + +class BalancedPositiveNegativeSampler(object): + """ + This class samples batches, ensuring that they contain a fixed proportion of positives + """ + + def __init__(self, batch_size_per_image, positive_fraction): + """ + Arguments: + batch_size_per_image (int): number of elements to be selected per image + positive_fraction (float): percentace of positive elements per batch + """ + self.batch_size_per_image = batch_size_per_image + self.positive_fraction = positive_fraction + + def __call__(self, matched_idxs): + """ + Arguments: + matched idxs: list of tensors containing -1, 0 or positive values. + Each tensor corresponds to a specific image. + -1 values are ignored, 0 are considered as negatives and > 0 as + positives. + + Returns: + pos_idx (list[tensor]) + neg_idx (list[tensor]) + + Returns two lists of binary masks for each image. + The first list contains the positive elements that were selected, + and the second list the negative example. 
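+ Each mask has the same shape as the corresponding entry of matched_idxs, so it can be used directly to index the per-image predictions.
+ Example (a minimal sketch; `matched_idxs` is an assumed list of per-image label tensors):
+ sampler = BalancedPositiveNegativeSampler(batch_size_per_image=256, positive_fraction=0.5)
+ pos_masks, neg_masks = sampler(matched_idxs)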
+ """ + pos_idx = [] + neg_idx = [] + for matched_idxs_per_image in matched_idxs: + positive = torch.nonzero(matched_idxs_per_image >= 1).squeeze(1) + negative = torch.nonzero(matched_idxs_per_image == 0).squeeze(1) + + num_pos = int(self.batch_size_per_image * self.positive_fraction) + # protect against not enough positive examples + num_pos = min(positive.numel(), num_pos) + num_neg = self.batch_size_per_image - num_pos + # protect against not enough negative examples + num_neg = min(negative.numel(), num_neg) + + # randomly select positive and negative examples + perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos] + perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg] + + pos_idx_per_image = positive[perm1] + neg_idx_per_image = negative[perm2] + + # create binary mask from indices + pos_idx_per_image_mask = torch.zeros_like( + matched_idxs_per_image, dtype=torch.uint8 + ) + neg_idx_per_image_mask = torch.zeros_like( + matched_idxs_per_image, dtype=torch.uint8 + ) + pos_idx_per_image_mask[pos_idx_per_image] = 1 + neg_idx_per_image_mask[neg_idx_per_image] = 1 + + pos_idx.append(pos_idx_per_image_mask) + neg_idx.append(neg_idx_per_image_mask) + + return pos_idx, neg_idx diff --git a/maskrcnn_benchmark/modeling/box_coder.py b/maskrcnn_benchmark/modeling/box_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..46a4acb3247003da2e6e24a4d28deb86de7d7aae --- /dev/null +++ b/maskrcnn_benchmark/modeling/box_coder.py @@ -0,0 +1,95 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import math + +import torch + + +class BoxCoder(object): + """ + This class encodes and decodes a set of bounding boxes into + the representation used for training the regressors. + """ + + def __init__(self, weights, bbox_xform_clip=math.log(1000. / 16)): + """ + Arguments: + weights (4-element tuple) + bbox_xform_clip (float) + """ + self.weights = weights + self.bbox_xform_clip = bbox_xform_clip + + def encode(self, reference_boxes, proposals): + """ + Encode a set of proposals with respect to some + reference boxes + + Arguments: + reference_boxes (Tensor): reference boxes + proposals (Tensor): boxes to be encoded + """ + + TO_REMOVE = 1 # TODO remove + ex_widths = proposals[:, 2] - proposals[:, 0] + TO_REMOVE + ex_heights = proposals[:, 3] - proposals[:, 1] + TO_REMOVE + ex_ctr_x = proposals[:, 0] + 0.5 * ex_widths + ex_ctr_y = proposals[:, 1] + 0.5 * ex_heights + + gt_widths = reference_boxes[:, 2] - reference_boxes[:, 0] + TO_REMOVE + gt_heights = reference_boxes[:, 3] - reference_boxes[:, 1] + TO_REMOVE + gt_ctr_x = reference_boxes[:, 0] + 0.5 * gt_widths + gt_ctr_y = reference_boxes[:, 1] + 0.5 * gt_heights + + wx, wy, ww, wh = self.weights + targets_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_widths + targets_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_heights + targets_dw = ww * torch.log(gt_widths / ex_widths) + targets_dh = wh * torch.log(gt_heights / ex_heights) + + targets = torch.stack((targets_dx, targets_dy, targets_dw, targets_dh), dim=1) + return targets + + def decode(self, rel_codes, boxes): + """ + From a set of original boxes and encoded relative box offsets, + get the decoded boxes. + + Arguments: + rel_codes (Tensor): encoded boxes + boxes (Tensor): reference boxes. 
+ """ + + boxes = boxes.to(rel_codes.dtype) + + TO_REMOVE = 1 # TODO remove + widths = boxes[:, 2] - boxes[:, 0] + TO_REMOVE + heights = boxes[:, 3] - boxes[:, 1] + TO_REMOVE + ctr_x = boxes[:, 0] + 0.5 * widths + ctr_y = boxes[:, 1] + 0.5 * heights + + wx, wy, ww, wh = self.weights + dx = rel_codes[:, 0::4] / wx + dy = rel_codes[:, 1::4] / wy + dw = rel_codes[:, 2::4] / ww + dh = rel_codes[:, 3::4] / wh + + # Prevent sending too large values into torch.exp() + dw = torch.clamp(dw, max=self.bbox_xform_clip) + dh = torch.clamp(dh, max=self.bbox_xform_clip) + + pred_ctr_x = dx * widths[:, None] + ctr_x[:, None] + pred_ctr_y = dy * heights[:, None] + ctr_y[:, None] + pred_w = torch.exp(dw) * widths[:, None] + pred_h = torch.exp(dh) * heights[:, None] + + pred_boxes = torch.zeros_like(rel_codes) + # x1 + pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w + # y1 + pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h + # x2 (note: "- 1" is correct; don't be fooled by the asymmetry) + pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - 1 + # y2 (note: "- 1" is correct; don't be fooled by the asymmetry) + pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - 1 + + return pred_boxes diff --git a/maskrcnn_benchmark/modeling/detector/__init__.py b/maskrcnn_benchmark/modeling/detector/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ff421e281e16e6623bab2551b242ea003d1f2166 --- /dev/null +++ b/maskrcnn_benchmark/modeling/detector/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from .detectors import build_detection_model diff --git a/maskrcnn_benchmark/modeling/detector/adaption_rcnn.py b/maskrcnn_benchmark/modeling/detector/adaption_rcnn.py new file mode 100644 index 0000000000000000000000000000000000000000..c9f5151cde247586a4e94dd6948ef7165962a73e --- /dev/null +++ b/maskrcnn_benchmark/modeling/detector/adaption_rcnn.py @@ -0,0 +1,65 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +""" +Implements the Generalized R-CNN framework +""" + +import torch +from torch import nn + +from maskrcnn_benchmark.structures.image_list import to_image_list + +from ..backbone import build_backbone +from ..rpn.rpn import build_rpn +from ..roi_heads.roi_heads import build_roi_heads + + +class AdaptionRCNN(nn.Module): + """ + Main class for Generalized R-CNN. Currently supports boxes and masks. + It consists of three main parts: + - backbone + - rpn + - heads: takes the features + the proposals from the RPN and computes + detections / masks from it. + """ + + def __init__(self, cfg): + super(AdaptionRCNN, self).__init__() + + self.backbone = build_backbone(cfg) + self.rpn = build_rpn(cfg, self.backbone.out_channels) + self.roi_heads = build_roi_heads(cfg, self.backbone.out_channels) + + self.generate_pseudo_labels = False + + def forward(self, images, targets=None, is_target_domain=False): + """ + Arguments: + images (list[Tensor] or ImageList): images to be processed + targets (list[BoxList]): ground-truth boxes present in the image (optional) + is_target_domain: synthesized images or not + + Returns: + result (list[BoxList] or dict[Tensor]): the output from the model. + During training, it returns a dict[Tensor] which contains the losses. + During testing, it returns list[BoxList] contains additional fields + like `scores`, `labels` and `mask` (for Mask R-CNN models). 
+ + """ + if self.training and targets is None: + raise ValueError("In training mode,targets should be passed") + + # extract shared deep features cross domain + images = to_image_list(images) + features = self.backbone(images.tensors) + + proposals, proposal_losses = self.rpn(images, features, targets) + x, result, detector_losses = self.roi_heads(features, proposals, targets) + + if self.training: + losses = {} + losses.update(detector_losses) + losses.update(proposal_losses) + return losses + + return result diff --git a/maskrcnn_benchmark/modeling/detector/detectors.py b/maskrcnn_benchmark/modeling/detector/detectors.py new file mode 100644 index 0000000000000000000000000000000000000000..df3b04a4fa3cd1842f890dbcc4067d8c2cd6553e --- /dev/null +++ b/maskrcnn_benchmark/modeling/detector/detectors.py @@ -0,0 +1,13 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from .generalized_rcnn import GeneralizedRCNN +from .adaption_rcnn import AdaptionRCNN + +_DETECTION_META_ARCHITECTURES = { + "GeneralizedRCNN": GeneralizedRCNN, + "AdaptionRCNN": AdaptionRCNN, +} + + +def build_detection_model(cfg): + meta_arch = _DETECTION_META_ARCHITECTURES[cfg.MODEL.META_ARCHITECTURE] + return meta_arch(cfg) diff --git a/maskrcnn_benchmark/modeling/detector/generalized_rcnn.py b/maskrcnn_benchmark/modeling/detector/generalized_rcnn.py new file mode 100644 index 0000000000000000000000000000000000000000..38dfd3af2a0d1a77423054c3d05a85af93fcfb02 --- /dev/null +++ b/maskrcnn_benchmark/modeling/detector/generalized_rcnn.py @@ -0,0 +1,65 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +""" +Implements the Generalized R-CNN framework +""" + +import torch +from torch import nn + +from maskrcnn_benchmark.structures.image_list import to_image_list + +from ..backbone import build_backbone +from ..rpn.rpn import build_rpn +from ..roi_heads.roi_heads import build_roi_heads + + +class GeneralizedRCNN(nn.Module): + """ + Main class for Generalized R-CNN. Currently supports boxes and masks. + It consists of three main parts: + - backbone + - rpn + - heads: takes the features + the proposals from the RPN and computes + detections / masks from it. + """ + + def __init__(self, cfg): + super(GeneralizedRCNN, self).__init__() + + self.backbone = build_backbone(cfg) + self.rpn = build_rpn(cfg, self.backbone.out_channels) + self.roi_heads = build_roi_heads(cfg, self.backbone.out_channels) + + def forward(self, images, targets=None): + """ + Arguments: + images (list[Tensor] or ImageList): images to be processed + targets (list[BoxList]): ground-truth boxes present in the image (optional) + + Returns: + result (list[BoxList] or dict[Tensor]): the output from the model. + During training, it returns a dict[Tensor] which contains the losses. + During testing, it returns list[BoxList] contains additional fields + like `scores`, `labels` and `mask` (for Mask R-CNN models). 
+ + """ + if self.training and targets is None: + raise ValueError("In training mode, targets should be passed") + images = to_image_list(images) + features = self.backbone(images.tensors) + proposals, proposal_losses = self.rpn(images, features, targets) + if self.roi_heads: + x, result, detector_losses = self.roi_heads(features, proposals, targets) + else: + # RPN-only models don't have roi_heads + x = features + result = proposals + detector_losses = {} + + if self.training: + losses = {} + losses.update(detector_losses) + losses.update(proposal_losses) + return losses + + return result diff --git a/maskrcnn_benchmark/modeling/make_layers.py b/maskrcnn_benchmark/modeling/make_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..74e56b0e2b25111604e2858ab12a1e7ffc8dcf6a --- /dev/null +++ b/maskrcnn_benchmark/modeling/make_layers.py @@ -0,0 +1,122 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +""" +Miscellaneous utility functions +""" + +import torch +from torch import nn +from torch.nn import functional as F +from maskrcnn_benchmark.config import cfg +from maskrcnn_benchmark.layers import Conv2d +from maskrcnn_benchmark.modeling.poolers import Pooler + + +def get_group_gn(dim, dim_per_gp, num_groups): + """get number of groups used by GroupNorm, based on number of channels.""" + assert dim_per_gp == -1 or num_groups == -1, \ + "GroupNorm: can only specify G or C/G." + + if dim_per_gp > 0: + assert dim % dim_per_gp == 0, \ + "dim: {}, dim_per_gp: {}".format(dim, dim_per_gp) + group_gn = dim // dim_per_gp + else: + assert dim % num_groups == 0, \ + "dim: {}, num_groups: {}".format(dim, num_groups) + group_gn = num_groups + + return group_gn + + +def group_norm(out_channels, affine=True, divisor=1): + out_channels = out_channels // divisor + dim_per_gp = cfg.MODEL.GROUP_NORM.DIM_PER_GP // divisor + num_groups = cfg.MODEL.GROUP_NORM.NUM_GROUPS // divisor + eps = cfg.MODEL.GROUP_NORM.EPSILON # default: 1e-5 + return torch.nn.GroupNorm( + get_group_gn(out_channels, dim_per_gp, num_groups), + out_channels, + eps, + affine + ) + + +def make_conv3x3( + in_channels, + out_channels, + dilation=1, + stride=1, + use_gn=False, + use_relu=False, + kaiming_init=True +): + conv = Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=dilation, + dilation=dilation, + bias=False if use_gn else True + ) + if kaiming_init: + nn.init.kaiming_normal_( + conv.weight, mode="fan_out", nonlinearity="relu" + ) + else: + torch.nn.init.normal_(conv.weight, std=0.01) + if not use_gn: + nn.init.constant_(conv.bias, 0) + module = [conv,] + if use_gn: + module.append(group_norm(out_channels)) + if use_relu: + module.append(nn.ReLU(inplace=True)) + if len(module) > 1: + return nn.Sequential(*module) + return conv + + +def make_fc(dim_in, hidden_dim, use_gn=False): + ''' + Caffe2 implementation uses XavierFill, which in fact + corresponds to kaiming_uniform_ in PyTorch + ''' + if use_gn: + fc = nn.Linear(dim_in, hidden_dim, bias=False) + nn.init.kaiming_uniform_(fc.weight, a=1) + return nn.Sequential(fc, group_norm(hidden_dim)) + fc = nn.Linear(dim_in, hidden_dim) + nn.init.kaiming_uniform_(fc.weight, a=1) + nn.init.constant_(fc.bias, 0) + return fc + + +def conv_with_kaiming_uniform(use_gn=False, use_relu=False): + def make_conv( + in_channels, out_channels, kernel_size, stride=1, dilation=1 + ): + conv = Conv2d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=dilation * (kernel_size - 1) // 2, + 
dilation=dilation, + bias=False if use_gn else True + ) + # Caffe2 implementation uses XavierFill, which in fact + # corresponds to kaiming_uniform_ in PyTorch + nn.init.kaiming_uniform_(conv.weight, a=1) + if not use_gn: + nn.init.constant_(conv.bias, 0) + module = [conv,] + if use_gn: + module.append(group_norm(out_channels)) + if use_relu: + module.append(nn.ReLU(inplace=True)) + if len(module) > 1: + return nn.Sequential(*module) + return conv + + return make_conv diff --git a/maskrcnn_benchmark/modeling/matcher.py b/maskrcnn_benchmark/modeling/matcher.py new file mode 100644 index 0000000000000000000000000000000000000000..35ec5f1fe819526055c10607f05d47ac88277de6 --- /dev/null +++ b/maskrcnn_benchmark/modeling/matcher.py @@ -0,0 +1,112 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch + + +class Matcher(object): + """ + This class assigns to each predicted "element" (e.g., a box) a ground-truth + element. Each predicted element will have exactly zero or one matches; each + ground-truth element may be assigned to zero or more predicted elements. + + Matching is based on the MxN match_quality_matrix, that characterizes how well + each (ground-truth, predicted)-pair match. For example, if the elements are + boxes, the matrix may contain box IoU overlap values. + + The matcher returns a tensor of size N containing the index of the ground-truth + element m that matches to prediction n. If there is no match, a negative value + is returned. + """ + + BELOW_LOW_THRESHOLD = -1 + BETWEEN_THRESHOLDS = -2 + + def __init__(self, high_threshold, low_threshold, allow_low_quality_matches=False): + """ + Args: + high_threshold (float): quality values greater than or equal to + this value are candidate matches. + low_threshold (float): a lower quality threshold used to stratify + matches into three levels: + 1) matches >= high_threshold + 2) BETWEEN_THRESHOLDS matches in [low_threshold, high_threshold) + 3) BELOW_LOW_THRESHOLD matches in [0, low_threshold) + allow_low_quality_matches (bool): if True, produce additional matches + for predictions that have only low-quality match candidates. See + set_low_quality_matches_ for more details. + """ + assert low_threshold <= high_threshold + self.high_threshold = high_threshold + self.low_threshold = low_threshold + self.allow_low_quality_matches = allow_low_quality_matches + + def __call__(self, match_quality_matrix): + """ + Args: + match_quality_matrix (Tensor[float]): an MxN tensor, containing the + pairwise quality between M ground-truth elements and N predicted elements. + + Returns: + matches (Tensor[int64]): an N tensor where N[i] is a matched gt in + [0, M - 1] or a negative value indicating that prediction i could not + be matched. 
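+ The negative value is either Matcher.BELOW_LOW_THRESHOLD (-1) or
+ Matcher.BETWEEN_THRESHOLDS (-2), depending on which band the best match
+ quality fell into.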
+ """ + if match_quality_matrix.numel() == 0: + # empty targets or proposals not supported during training + if match_quality_matrix.shape[0] == 0: + raise ValueError( + "No ground-truth boxes available for one of the images " + "during training") + else: + raise ValueError( + "No proposal boxes available for one of the images " + "during training") + + # match_quality_matrix is M (gt) x N (predicted) + # Max over gt elements (dim 0) to find best gt candidate for each prediction + matched_vals, matches = match_quality_matrix.max(dim=0) + if self.allow_low_quality_matches: + all_matches = matches.clone() + + # Assign candidate matches with low quality to negative (unassigned) values + below_low_threshold = matched_vals < self.low_threshold + between_thresholds = (matched_vals >= self.low_threshold) & ( + matched_vals < self.high_threshold + ) + matches[below_low_threshold] = Matcher.BELOW_LOW_THRESHOLD + matches[between_thresholds] = Matcher.BETWEEN_THRESHOLDS + + if self.allow_low_quality_matches: + self.set_low_quality_matches_(matches, all_matches, match_quality_matrix) + + return matches + + def set_low_quality_matches_(self, matches, all_matches, match_quality_matrix): + """ + Produce additional matches for predictions that have only low-quality matches. + Specifically, for each ground-truth find the set of predictions that have + maximum overlap with it (including ties); for each prediction in that set, if + it is unmatched, then match it to the ground-truth with which it has the highest + quality value. + """ + # For each gt, find the prediction with which it has highest quality + highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1) + # Find highest quality match available, even if it is low, including ties + gt_pred_pairs_of_highest_quality = torch.nonzero( + match_quality_matrix == highest_quality_foreach_gt[:, None] + ) + # Example gt_pred_pairs_of_highest_quality: + # tensor([[ 0, 39796], + # [ 1, 32055], + # [ 1, 32070], + # [ 2, 39190], + # [ 2, 40255], + # [ 3, 40390], + # [ 3, 41455], + # [ 4, 45470], + # [ 5, 45325], + # [ 5, 46390]]) + # Each row is a (gt index, prediction index) + # Note how gt items 1, 2, 3, and 5 each have two ties + + pred_inds_to_update = gt_pred_pairs_of_highest_quality[:, 1] + matches[pred_inds_to_update] = all_matches[pred_inds_to_update] diff --git a/maskrcnn_benchmark/modeling/poolers.py b/maskrcnn_benchmark/modeling/poolers.py new file mode 100644 index 0000000000000000000000000000000000000000..9b3524d2052493fda0baad3f621f192d253636b0 --- /dev/null +++ b/maskrcnn_benchmark/modeling/poolers.py @@ -0,0 +1,133 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +import torch.nn.functional as F +from torch import nn + +from maskrcnn_benchmark.layers import ROIAlign + +from .utils import cat + + +class LevelMapper(object): + """Determine which FPN level each RoI in a set of RoIs should map to based + on the heuristic in the FPN paper. 
+ """ + + def __init__(self, k_min, k_max, canonical_scale=224, canonical_level=4, eps=1e-6): + """ + Arguments: + k_min (int) + k_max (int) + canonical_scale (int) + canonical_level (int) + eps (float) + """ + self.k_min = k_min + self.k_max = k_max + self.s0 = canonical_scale + self.lvl0 = canonical_level + self.eps = eps + + def __call__(self, boxlists): + """ + Arguments: + boxlists (list[BoxList]) + """ + # Compute level ids + s = torch.sqrt(cat([boxlist.area() for boxlist in boxlists])) + + # Eqn.(1) in FPN paper + target_lvls = torch.floor(self.lvl0 + torch.log2(s / self.s0 + self.eps)) + target_lvls = torch.clamp(target_lvls, min=self.k_min, max=self.k_max) + return target_lvls.to(torch.int64) - self.k_min + + +class Pooler(nn.Module): + """ + Pooler for Detection with or without FPN. + It currently hard-code ROIAlign in the implementation, + but that can be made more generic later on. + Also, the requirement of passing the scales is not strictly necessary, as they + can be inferred from the size of the feature map / size of original image, + which is available thanks to the BoxList. + """ + + def __init__(self, output_size, scales, sampling_ratio): + """ + Arguments: + output_size (list[tuple[int]] or list[int]): output size for the pooled region + scales (list[float]): scales for each Pooler + sampling_ratio (int): sampling ratio for ROIAlign + """ + super(Pooler, self).__init__() + poolers = [] + for scale in scales: + poolers.append( + ROIAlign( + output_size, spatial_scale=scale, sampling_ratio=sampling_ratio + ) + ) + self.poolers = nn.ModuleList(poolers) + self.output_size = output_size + # get the levels in the feature map by leveraging the fact that the network always + # downsamples by a factor of 2 at each level. + lvl_min = -torch.log2(torch.tensor(scales[0], dtype=torch.float32)).item() + lvl_max = -torch.log2(torch.tensor(scales[-1], dtype=torch.float32)).item() + self.map_levels = LevelMapper(lvl_min, lvl_max) + + def convert_to_roi_format(self, boxes): + concat_boxes = cat([b.bbox for b in boxes], dim=0) + device, dtype = concat_boxes.device, concat_boxes.dtype + ids = cat( + [ + torch.full((len(b), 1), i, dtype=dtype, device=device) + for i, b in enumerate(boxes) + ], + dim=0, + ) + rois = torch.cat([ids, concat_boxes], dim=1) + return rois + + def forward(self, x, boxes): + """ + Arguments: + x (list[Tensor]): feature maps for each level + boxes (list[BoxList]): boxes to be used to perform the pooling operation. 
+ Returns: + result (Tensor) + """ + num_levels = len(self.poolers) + rois = self.convert_to_roi_format(boxes) + if num_levels == 1: + return self.poolers[0](x[0], rois) + + levels = self.map_levels(boxes) + + num_rois = len(rois) + num_channels = x[0].shape[1] + output_size = self.output_size[0] + + dtype, device = x[0].dtype, x[0].device + result = torch.zeros( + (num_rois, num_channels, output_size, output_size), + dtype=dtype, + device=device, + ) + for level, (per_level_feature, pooler) in enumerate(zip(x, self.poolers)): + idx_in_level = torch.nonzero(levels == level).squeeze(1) + rois_per_level = rois[idx_in_level] + result[idx_in_level] = pooler(per_level_feature, rois_per_level) + + return result + + +def make_pooler(cfg, head_name): + resolution = cfg.MODEL[head_name].POOLER_RESOLUTION + scales = cfg.MODEL[head_name].POOLER_SCALES + sampling_ratio = cfg.MODEL[head_name].POOLER_SAMPLING_RATIO + pooler = Pooler( + output_size=(resolution, resolution), + scales=scales, + sampling_ratio=sampling_ratio, + ) + return pooler diff --git a/maskrcnn_benchmark/modeling/registry.py b/maskrcnn_benchmark/modeling/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..e14fb118c458d0ba97d2a699be3004c6bdd3913c --- /dev/null +++ b/maskrcnn_benchmark/modeling/registry.py @@ -0,0 +1,12 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +from maskrcnn_benchmark.utils.registry import Registry + +BACKBONES = Registry() +RPN_HEADS = Registry() +ROI_BOX_FEATURE_EXTRACTORS = Registry() +ROI_BOX_PREDICTOR = Registry() +ROI_KEYPOINT_FEATURE_EXTRACTORS = Registry() +ROI_KEYPOINT_PREDICTOR = Registry() +ROI_MASK_FEATURE_EXTRACTORS = Registry() +ROI_MASK_PREDICTOR = Registry() diff --git a/maskrcnn_benchmark/modeling/roi_heads/__init__.py b/maskrcnn_benchmark/modeling/roi_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/__init__.py b/maskrcnn_benchmark/modeling/roi_heads/box_head/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/box_head.py b/maskrcnn_benchmark/modeling/roi_heads/box_head/box_head.py new file mode 100644 index 0000000000000000000000000000000000000000..482081b8de7431282c8a017cd34d965c8f355bb0 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/box_head/box_head.py @@ -0,0 +1,71 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +from torch import nn + +from .roi_box_feature_extractors import make_roi_box_feature_extractor +from .roi_box_predictors import make_roi_box_predictor +from .inference import make_roi_box_post_processor +from .loss import make_roi_box_loss_evaluator + + +class ROIBoxHead(torch.nn.Module): + """ + Generic Box Head class. 
+ """ + + def __init__(self, cfg, in_channels): + super(ROIBoxHead, self).__init__() + self.feature_extractor = make_roi_box_feature_extractor(cfg, in_channels) + self.predictor = make_roi_box_predictor( + cfg, self.feature_extractor.out_channels) + self.post_processor = make_roi_box_post_processor(cfg) + self.loss_evaluator = make_roi_box_loss_evaluator(cfg) + + def forward(self, features, proposals, targets=None): + """ + Arguments: + features (list[Tensor]): feature-maps from possibly several levels + proposals (list[BoxList]): proposal boxes + targets (list[BoxList], optional): the ground-truth targets. + + Returns: + x (Tensor): the result of the feature extractor + proposals (list[BoxList]): during training, the subsampled proposals + are returned. During testing, the predicted boxlists are returned + losses (dict[Tensor]): During training, returns the losses for the + head. During testing, returns an empty dict. + """ + + if self.training: + # Faster R-CNN subsamples during training the proposals with a fixed + # positive / negative ratio + with torch.no_grad(): + proposals = self.loss_evaluator.subsample(proposals, targets) + + # extract features that will be fed to the final classifier. The + # feature_extractor generally corresponds to the pooler + heads + x = self.feature_extractor(features, proposals) + # final classifier that converts the features into predictions + class_logits, box_regression = self.predictor(x) + + if not self.training: + result = self.post_processor((class_logits, box_regression), proposals) + return x, result, {} + + loss_classifier, loss_box_reg = self.loss_evaluator( + [class_logits], [box_regression] + ) + return ( + x, + proposals, + dict(loss_classifier=loss_classifier, loss_box_reg=loss_box_reg), + ) + + +def build_roi_box_head(cfg, in_channels): + """ + Constructs a new box head. + By default, uses ROIBoxHead, but if it turns out not to be enough, just register a new class + and make it a parameter in the config + """ + return ROIBoxHead(cfg, in_channels) diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/inference.py b/maskrcnn_benchmark/modeling/roi_heads/box_head/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..595a2e61620fbd345bc36060c43191792fc010ea --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/box_head/inference.py @@ -0,0 +1,167 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+import torch +import torch.nn.functional as F +from torch import nn + +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_nms +from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist +from maskrcnn_benchmark.modeling.box_coder import BoxCoder + + +class PostProcessor(nn.Module): + """ + From a set of classification scores, box regression and proposals, + computes the post-processed boxes, and applies NMS to obtain the + final results + """ + + def __init__( + self, + score_thresh=0.05, + nms=0.5, + detections_per_img=100, + box_coder=None, + cls_agnostic_bbox_reg=False + ): + """ + Arguments: + score_thresh (float) + nms (float) + detections_per_img (int) + box_coder (BoxCoder) + """ + super(PostProcessor, self).__init__() + self.score_thresh = score_thresh + self.nms = nms + self.detections_per_img = detections_per_img + if box_coder is None: + box_coder = BoxCoder(weights=(10., 10., 5., 5.)) + self.box_coder = box_coder + self.cls_agnostic_bbox_reg = cls_agnostic_bbox_reg + + def forward(self, x, boxes): + """ + Arguments: + x (tuple[tensor, tensor]): x contains the class logits + and the box_regression from the model. + boxes (list[BoxList]): bounding boxes that are used as + reference, one for ech image + + Returns: + results (list[BoxList]): one BoxList for each image, containing + the extra fields labels and scores + """ + class_logits, box_regression = x + class_prob = F.softmax(class_logits, -1) + + # TODO think about a representation of batch of boxes + image_shapes = [box.size for box in boxes] + boxes_per_image = [len(box) for box in boxes] + concat_boxes = torch.cat([a.bbox for a in boxes], dim=0) + + if self.cls_agnostic_bbox_reg: + box_regression = box_regression[:, -4:] + proposals = self.box_coder.decode( + box_regression.view(sum(boxes_per_image), -1), concat_boxes + ) + if self.cls_agnostic_bbox_reg: + proposals = proposals.repeat(1, class_prob.shape[1]) + + num_classes = class_prob.shape[1] + + proposals = proposals.split(boxes_per_image, dim=0) + class_prob = class_prob.split(boxes_per_image, dim=0) + + results = [] + for prob, boxes_per_img, image_shape in zip( + class_prob, proposals, image_shapes + ): + boxlist = self.prepare_boxlist(boxes_per_img, prob, image_shape) + boxlist = boxlist.clip_to_image(remove_empty=False) + boxlist = self.filter_results(boxlist, num_classes) + results.append(boxlist) + return results + + def prepare_boxlist(self, boxes, scores, image_shape): + """ + Returns BoxList from `boxes` and adds probability scores information + as an extra field + `boxes` has shape (#detections, 4 * #classes), where each row represents + a list of predicted bounding boxes for each of the object classes in the + dataset (including the background class). The detections in each row + originate from the same object proposal. + `scores` has shape (#detection, #classes), where each row represents a list + of object detection confidence scores for each of the object classes in the + dataset (including the background class). `scores[i, j]`` corresponds to the + box at `boxes[i, j * 4:(j + 1) * 4]`. + """ + boxes = boxes.reshape(-1, 4) + scores = scores.reshape(-1) + boxlist = BoxList(boxes, image_shape, mode="xyxy") + boxlist.add_field("scores", scores) + return boxlist + + def filter_results(self, boxlist, num_classes): + """Returns bounding-box detection results by thresholding on scores and + applying non-maximum suppression (NMS). 
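+ NMS is applied independently for each foreground class (the background
+ class j = 0 is skipped), and at most `detections_per_img` detections over
+ all classes are kept per image.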
+ """ + # unwrap the boxlist to avoid additional overhead. + # if we had multi-class NMS, we could perform this directly on the boxlist + boxes = boxlist.bbox.reshape(-1, num_classes * 4) + scores = boxlist.get_field("scores").reshape(-1, num_classes) + + device = scores.device + result = [] + # Apply threshold on detection probabilities and apply NMS + # Skip j = 0, because it's the background class + inds_all = scores > self.score_thresh + for j in range(1, num_classes): + inds = inds_all[:, j].nonzero().squeeze(1) + scores_j = scores[inds, j] + boxes_j = boxes[inds, j * 4 : (j + 1) * 4] + boxlist_for_class = BoxList(boxes_j, boxlist.size, mode="xyxy") + boxlist_for_class.add_field("scores", scores_j) + boxlist_for_class = boxlist_nms( + boxlist_for_class, self.nms + ) + num_labels = len(boxlist_for_class) + boxlist_for_class.add_field( + "labels", torch.full((num_labels,), j, dtype=torch.int64, device=device) + ) + result.append(boxlist_for_class) + + result = cat_boxlist(result) + number_of_detections = len(result) + + # Limit to max_per_image detections **over all classes** + if number_of_detections > self.detections_per_img > 0: + cls_scores = result.get_field("scores") + image_thresh, _ = torch.kthvalue( + cls_scores.cpu(), number_of_detections - self.detections_per_img + 1 + ) + keep = cls_scores >= image_thresh.item() + keep = torch.nonzero(keep).squeeze(1) + result = result[keep] + return result + + +def make_roi_box_post_processor(cfg): + use_fpn = cfg.MODEL.ROI_HEADS.USE_FPN + + bbox_reg_weights = cfg.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS + box_coder = BoxCoder(weights=bbox_reg_weights) + + score_thresh = cfg.MODEL.ROI_HEADS.SCORE_THRESH + nms_thresh = cfg.MODEL.ROI_HEADS.NMS + detections_per_img = cfg.MODEL.ROI_HEADS.DETECTIONS_PER_IMG + cls_agnostic_bbox_reg = cfg.MODEL.CLS_AGNOSTIC_BBOX_REG + + postprocessor = PostProcessor( + score_thresh, + nms_thresh, + detections_per_img, + box_coder, + cls_agnostic_bbox_reg + ) + return postprocessor diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/loss.py b/maskrcnn_benchmark/modeling/roi_heads/box_head/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..9f2771d029e6d027b29e60b83d268f03628d3a14 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/box_head/loss.py @@ -0,0 +1,193 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +from torch.nn import functional as F + +from maskrcnn_benchmark.layers import smooth_l1_loss +from maskrcnn_benchmark.modeling.box_coder import BoxCoder +from maskrcnn_benchmark.modeling.matcher import Matcher +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou +from maskrcnn_benchmark.modeling.balanced_positive_negative_sampler import ( + BalancedPositiveNegativeSampler +) +from maskrcnn_benchmark.modeling.utils import cat + + +class FastRCNNLossComputation(object): + """ + Computes the loss for Faster R-CNN. 
+ Also supports FPN + """ + + def __init__( + self, + proposal_matcher, + fg_bg_sampler, + box_coder, + cls_agnostic_bbox_reg=False + ): + """ + Arguments: + proposal_matcher (Matcher) + fg_bg_sampler (BalancedPositiveNegativeSampler) + box_coder (BoxCoder) + """ + self.proposal_matcher = proposal_matcher + self.fg_bg_sampler = fg_bg_sampler + self.box_coder = box_coder + self.cls_agnostic_bbox_reg = cls_agnostic_bbox_reg + + def match_targets_to_proposals(self, proposal, target): + match_quality_matrix = boxlist_iou(target, proposal) + matched_idxs = self.proposal_matcher(match_quality_matrix) + # Fast RCNN only need "labels" field for selecting the targets + target = target.copy_with_fields("labels") + # get the targets corresponding GT for each proposal + # NB: need to clamp the indices because we can have a single + # GT in the image, and matched_idxs can be -2, which goes + # out of bounds + matched_targets = target[matched_idxs.clamp(min=0)] + matched_targets.add_field("matched_idxs", matched_idxs) + return matched_targets + + def prepare_targets(self, proposals, targets): + labels = [] + regression_targets = [] + for proposals_per_image, targets_per_image in zip(proposals, targets): + matched_targets = self.match_targets_to_proposals( + proposals_per_image, targets_per_image + ) + matched_idxs = matched_targets.get_field("matched_idxs") + + labels_per_image = matched_targets.get_field("labels") + labels_per_image = labels_per_image.to(dtype=torch.int64) + + # Label background (below the low threshold) + bg_inds = matched_idxs == Matcher.BELOW_LOW_THRESHOLD + labels_per_image[bg_inds] = 0 + + # Label ignore proposals (between low and high thresholds) + ignore_inds = matched_idxs == Matcher.BETWEEN_THRESHOLDS + labels_per_image[ignore_inds] = -1 # -1 is ignored by sampler + + # compute regression targets + regression_targets_per_image = self.box_coder.encode( + matched_targets.bbox, proposals_per_image.bbox + ) + + labels.append(labels_per_image) + regression_targets.append(regression_targets_per_image) + + return labels, regression_targets + + def subsample(self, proposals, targets): + """ + This method performs the positive/negative sampling, and return + the sampled proposals. + Note: this function keeps a state. + + Arguments: + proposals (list[BoxList]) + targets (list[BoxList]) + """ + + labels, regression_targets = self.prepare_targets(proposals, targets) + sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels) + + proposals = list(proposals) + # add corresponding label and regression_targets information to the bounding boxes + for labels_per_image, regression_targets_per_image, proposals_per_image in zip( + labels, regression_targets, proposals + ): + proposals_per_image.add_field("labels", labels_per_image) + proposals_per_image.add_field( + "regression_targets", regression_targets_per_image + ) + + # distributed sampled proposals, that were obtained on all feature maps + # concatenated via the fg_bg_sampler, into individual feature map levels + for img_idx, (pos_inds_img, neg_inds_img) in enumerate( + zip(sampled_pos_inds, sampled_neg_inds) + ): + img_sampled_inds = torch.nonzero(pos_inds_img | neg_inds_img).squeeze(1) + proposals_per_image = proposals[img_idx][img_sampled_inds] + proposals[img_idx] = proposals_per_image + + self._proposals = proposals + return proposals + + def __call__(self, class_logits, box_regression): + """ + Computes the loss for Faster R-CNN. + This requires that the subsample method has been called beforehand. 
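+ (subsample stores the sampled proposals, together with their "labels" and
+ "regression_targets" fields, in self._proposals, which this method then
+ reads.)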
+ + Arguments: + class_logits (list[Tensor]) + box_regression (list[Tensor]) + + Returns: + classification_loss (Tensor) + box_loss (Tensor) + """ + + class_logits = cat(class_logits, dim=0) + box_regression = cat(box_regression, dim=0) + device = class_logits.device + + if not hasattr(self, "_proposals"): + raise RuntimeError("subsample needs to be called before") + + proposals = self._proposals + + labels = cat([proposal.get_field("labels") for proposal in proposals], dim=0) + regression_targets = cat( + [proposal.get_field("regression_targets") for proposal in proposals], dim=0 + ) + + classification_loss = F.cross_entropy(class_logits, labels) + + # get indices that correspond to the regression targets for + # the corresponding ground truth labels, to be used with + # advanced indexing + sampled_pos_inds_subset = torch.nonzero(labels > 0).squeeze(1) + labels_pos = labels[sampled_pos_inds_subset] + if self.cls_agnostic_bbox_reg: + map_inds = torch.tensor([4, 5, 6, 7], device=device) + else: + map_inds = 4 * labels_pos[:, None] + torch.tensor( + [0, 1, 2, 3], device=device) + + box_loss = smooth_l1_loss( + box_regression[sampled_pos_inds_subset[:, None], map_inds], + regression_targets[sampled_pos_inds_subset], + size_average=False, + beta=1, + ) + box_loss = box_loss / labels.numel() + + return classification_loss, box_loss + + +def make_roi_box_loss_evaluator(cfg): + matcher = Matcher( + cfg.MODEL.ROI_HEADS.FG_IOU_THRESHOLD, + cfg.MODEL.ROI_HEADS.BG_IOU_THRESHOLD, + allow_low_quality_matches=False, + ) + + bbox_reg_weights = cfg.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS + box_coder = BoxCoder(weights=bbox_reg_weights) + + fg_bg_sampler = BalancedPositiveNegativeSampler( + cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE, cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION + ) + + cls_agnostic_bbox_reg = cfg.MODEL.CLS_AGNOSTIC_BBOX_REG + + loss_evaluator = FastRCNNLossComputation( + matcher, + fg_bg_sampler, + box_coder, + cls_agnostic_bbox_reg + ) + + return loss_evaluator diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_feature_extractors.py b/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_feature_extractors.py new file mode 100644 index 0000000000000000000000000000000000000000..e477147462937f72280a7ec73cf8226647f47bf7 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_feature_extractors.py @@ -0,0 +1,151 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
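+# Worked example for the box-regression indexing in loss.py above
+# (illustrative): with class-specific regression, a positive proposal labelled
+# class 3 reads its four deltas from columns 4*3 + [0, 1, 2, 3] = [12..15] of
+# the (N, 4 * num_classes) box_regression tensor; with CLS_AGNOSTIC_BBOX_REG
+# the fixed columns [4, 5, 6, 7] (the single foreground box) are used instead.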
+import torch +from torch import nn +from torch.nn import functional as F + +from maskrcnn_benchmark.modeling import registry +from maskrcnn_benchmark.modeling.backbone import resnet +from maskrcnn_benchmark.modeling.poolers import Pooler +from maskrcnn_benchmark.modeling.make_layers import group_norm +from maskrcnn_benchmark.modeling.make_layers import make_fc + + +@registry.ROI_BOX_FEATURE_EXTRACTORS.register("ResNet50Conv5ROIFeatureExtractor") +class ResNet50Conv5ROIFeatureExtractor(nn.Module): + def __init__(self, config, in_channels): + super(ResNet50Conv5ROIFeatureExtractor, self).__init__() + + resolution = config.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION + scales = config.MODEL.ROI_BOX_HEAD.POOLER_SCALES + sampling_ratio = config.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO + pooler = Pooler( + output_size=(resolution, resolution), + scales=scales, + sampling_ratio=sampling_ratio, + ) + + stage = resnet.StageSpec(index=4, block_count=3, return_features=False) + head = resnet.ResNetHead( + block_module=config.MODEL.RESNETS.TRANS_FUNC, + stages=(stage,), + num_groups=config.MODEL.RESNETS.NUM_GROUPS, + width_per_group=config.MODEL.RESNETS.WIDTH_PER_GROUP, + stride_in_1x1=config.MODEL.RESNETS.STRIDE_IN_1X1, + stride_init=None, + res2_out_channels=config.MODEL.RESNETS.RES2_OUT_CHANNELS, + dilation=config.MODEL.RESNETS.RES5_DILATION + ) + + self.pooler = pooler + self.head = head + self.out_channels = head.out_channels + + def forward(self, x, proposals): + x = self.pooler(x, proposals) + x = self.head(x) + return x + + +@registry.ROI_BOX_FEATURE_EXTRACTORS.register("FPN2MLPFeatureExtractor") +class FPN2MLPFeatureExtractor(nn.Module): + """ + Heads for FPN for classification + """ + + def __init__(self, cfg, in_channels): + super(FPN2MLPFeatureExtractor, self).__init__() + + resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION + scales = cfg.MODEL.ROI_BOX_HEAD.POOLER_SCALES + sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO + pooler = Pooler( + output_size=(resolution, resolution), + scales=scales, + sampling_ratio=sampling_ratio, + ) + input_size = in_channels * resolution ** 2 + representation_size = cfg.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM + use_gn = cfg.MODEL.ROI_BOX_HEAD.USE_GN + self.pooler = pooler + self.fc6 = make_fc(input_size, representation_size, use_gn) + self.fc7 = make_fc(representation_size, representation_size, use_gn) + self.out_channels = representation_size + + def forward(self, x, proposals): + x = self.pooler(x, proposals) + x = x.view(x.size(0), -1) + + x = F.relu(self.fc6(x)) + x = F.relu(self.fc7(x)) + + return x + + +@registry.ROI_BOX_FEATURE_EXTRACTORS.register("FPNXconv1fcFeatureExtractor") +class FPNXconv1fcFeatureExtractor(nn.Module): + """ + Heads for FPN for classification + """ + + def __init__(self, cfg, in_channels): + super(FPNXconv1fcFeatureExtractor, self).__init__() + + resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION + scales = cfg.MODEL.ROI_BOX_HEAD.POOLER_SCALES + sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO + pooler = Pooler( + output_size=(resolution, resolution), + scales=scales, + sampling_ratio=sampling_ratio, + ) + self.pooler = pooler + + use_gn = cfg.MODEL.ROI_BOX_HEAD.USE_GN + conv_head_dim = cfg.MODEL.ROI_BOX_HEAD.CONV_HEAD_DIM + num_stacked_convs = cfg.MODEL.ROI_BOX_HEAD.NUM_STACKED_CONVS + dilation = cfg.MODEL.ROI_BOX_HEAD.DILATION + + xconvs = [] + for ix in range(num_stacked_convs): + xconvs.append( + nn.Conv2d( + in_channels, + conv_head_dim, + kernel_size=3, + stride=1, + padding=dilation, + 
dilation=dilation, + bias=False if use_gn else True + ) + ) + in_channels = conv_head_dim + if use_gn: + xconvs.append(group_norm(in_channels)) + xconvs.append(nn.ReLU(inplace=True)) + + self.add_module("xconvs", nn.Sequential(*xconvs)) + for modules in [self.xconvs,]: + for l in modules.modules(): + if isinstance(l, nn.Conv2d): + torch.nn.init.normal_(l.weight, std=0.01) + if not use_gn: + torch.nn.init.constant_(l.bias, 0) + + input_size = conv_head_dim * resolution ** 2 + representation_size = cfg.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM + self.fc6 = make_fc(input_size, representation_size, use_gn=False) + self.out_channels = representation_size + + def forward(self, x, proposals): + x = self.pooler(x, proposals) + x = self.xconvs(x) + x = x.view(x.size(0), -1) + x = F.relu(self.fc6(x)) + return x + + +def make_roi_box_feature_extractor(cfg, in_channels): + func = registry.ROI_BOX_FEATURE_EXTRACTORS[ + cfg.MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR + ] + return func(cfg, in_channels) diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_predictors.py b/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_predictors.py new file mode 100644 index 0000000000000000000000000000000000000000..66ee4ace585cff5ea2933553d3e800f03757eba9 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_predictors.py @@ -0,0 +1,62 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from maskrcnn_benchmark.modeling import registry +from torch import nn + + +@registry.ROI_BOX_PREDICTOR.register("FastRCNNPredictor") +class FastRCNNPredictor(nn.Module): + def __init__(self, config, in_channels): + super(FastRCNNPredictor, self).__init__() + assert in_channels is not None + + num_inputs = in_channels + + num_classes = config.MODEL.ROI_BOX_HEAD.NUM_CLASSES + self.avgpool = nn.AdaptiveAvgPool2d(1) + self.cls_score = nn.Linear(num_inputs, num_classes) + num_bbox_reg_classes = 2 if config.MODEL.CLS_AGNOSTIC_BBOX_REG else num_classes + self.bbox_pred = nn.Linear(num_inputs, num_bbox_reg_classes * 4) + + nn.init.normal_(self.cls_score.weight, mean=0, std=0.01) + nn.init.constant_(self.cls_score.bias, 0) + + nn.init.normal_(self.bbox_pred.weight, mean=0, std=0.001) + nn.init.constant_(self.bbox_pred.bias, 0) + + def forward(self, x): + x = self.avgpool(x) + x = x.view(x.size(0), -1) + cls_logit = self.cls_score(x) + bbox_pred = self.bbox_pred(x) + return cls_logit, bbox_pred + + +@registry.ROI_BOX_PREDICTOR.register("FPNPredictor") +class FPNPredictor(nn.Module): + def __init__(self, cfg, in_channels): + super(FPNPredictor, self).__init__() + num_classes = cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES + representation_size = in_channels + + self.cls_score = nn.Linear(representation_size, num_classes) + num_bbox_reg_classes = 2 if cfg.MODEL.CLS_AGNOSTIC_BBOX_REG else num_classes + self.bbox_pred = nn.Linear(representation_size, num_bbox_reg_classes * 4) + + nn.init.normal_(self.cls_score.weight, std=0.01) + nn.init.normal_(self.bbox_pred.weight, std=0.001) + for l in [self.cls_score, self.bbox_pred]: + nn.init.constant_(l.bias, 0) + + def forward(self, x): + if x.ndimension() == 4: + assert list(x.shape[2:]) == [1, 1] + x = x.view(x.size(0), -1) + scores = self.cls_score(x) + bbox_deltas = self.bbox_pred(x) + + return scores, bbox_deltas + + +def make_roi_box_predictor(cfg, in_channels): + func = registry.ROI_BOX_PREDICTOR[cfg.MODEL.ROI_BOX_HEAD.PREDICTOR] + return func(cfg, in_channels) diff --git a/maskrcnn_benchmark/modeling/roi_heads/heatmap_head/__init__.py 
b/maskrcnn_benchmark/modeling/roi_heads/heatmap_head/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/maskrcnn_benchmark/modeling/roi_heads/heatmap_head/heatmap_head.py b/maskrcnn_benchmark/modeling/roi_heads/heatmap_head/heatmap_head.py new file mode 100644 index 0000000000000000000000000000000000000000..db994956da921fb5d159fc48fbe00f54e8458636 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/heatmap_head/heatmap_head.py @@ -0,0 +1,124 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class HeatmapHead(nn.Module): + def __init__(self, cfg, in_channels): + super().__init__() + self.num_classes = cfg.MODEL.DENSITY_HEAD.NUM_CLASSES + self.level = cfg.MODEL.DENSITY_HEAD.FPN_LEVEL + self.mode = cfg.MODEL.DENSITY_HEAD.INTERPOLATE_MODE + + self.heatmap_convs = nn.Sequential( + nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1), + nn.ReLU(), + nn.Conv2d(in_channels, self.num_classes, kernel_size=1), + nn.ReLU(), + ) + self.convs_1x1 = nn.ModuleList() + for i in range(4): + self.convs_1x1.append( + nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0), + ) + self.density_criterion = nn.MSELoss(reduction='sum') + self.reset_parameters() + + def reset_parameters(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight, std=0.001) + nn.init.zeros_(m.bias) + + def forward(self, features, detections, targets=None): + """ + Args: + features: Feature pyramid from FPN: (p2, p3, p4, p5, p6) + detections: list[Boxlist] + targets: list[Boxlist] + Returns: + predication, detections, loss dict + """ + if self.level == 1: + return self.compat_forward(features, detections, targets) + + features = list(features) + conv1x1_index = 0 + for i in range(0, self.level): + feature = F.interpolate(features[i], size=features[i + 1].shape[2:], mode=self.mode, align_corners=False) + feature = self.convs_1x1[conv1x1_index](feature) + conv1x1_index += 1 + features[i + 1] = feature + features[i + 1] + for i in range(len(features) - 1, self.level, -1): + feature = F.interpolate(features[i], size=features[i - 1].shape[2:], mode=self.mode, align_corners=False) + feature = self.convs_1x1[conv1x1_index](feature) + conv1x1_index += 1 + features[i - 1] = feature + features[i - 1] + + feature = features[self.level] + + density_logits = self.heatmap_convs(feature) + + if not self.training: + density = torch.sum(density_logits, dim=(2, 3)) # (batch, num_classes) + for i, detection in enumerate(detections): + detection.add_field('density', density[i]) + return feature, density_logits, detections, {} + + gt_density = [] + for i, target in enumerate(targets): + gt_heatmap = target.get_field('heatmap').heatmap + if tuple(gt_heatmap.shape) != tuple(density_logits[i].shape): + gt_heatmap = F.pad(gt_heatmap, [0, density_logits[i].shape[2] - gt_heatmap.shape[2], + 0, density_logits[i].shape[1] - gt_heatmap.shape[1]]) + gt_density.append(gt_heatmap) + + gt_density = torch.stack(gt_density, dim=0) + loss_density = self.density_criterion(density_logits, gt_density) + return feature, density_logits, detections, dict(loss_density=loss_density) + + def compat_forward(self, features, detections, targets=None): + """ + Args: + features: Feature pyramid from FPN: (p2, p3, p4, p5, p6) + detections: list[Boxlist] + targets: list[Boxlist] + Returns: + predication, detections, loss dict + """ + p2, p3, p4, p5, p6 = features + mode = 'bilinear' + p6 = 
F.interpolate(p6, size=p5.shape[2:], mode=mode, align_corners=False) + p5 = p5 + self.convs_1x1[0](p6) + + p5 = F.interpolate(p5, size=p4.shape[2:], mode=mode, align_corners=False) + p4 = p4 + self.convs_1x1[1](p5) + + p4 = F.interpolate(p4, size=p3.shape[2:], mode=mode, align_corners=False) + p2 = F.interpolate(p2, size=p3.shape[2:], mode=mode, align_corners=False) + + feature = self.convs_1x1[2](p4) + self.convs_1x1[3](p2) + p3 + + density_logits = self.heatmap_convs(feature) + + if not self.training: + density = torch.sum(density_logits, dim=(2, 3)) # (batch, num_classes) + for i, detection in enumerate(detections): + detection.add_field('density', density[i]) + return feature, density_logits, detections, {} + + gt_density = [] + for i, target in enumerate(targets): + gt_heatmap = target.get_field('heatmap').heatmap + if tuple(gt_heatmap.shape) != tuple(density_logits[i].shape): + gt_heatmap = F.pad(gt_heatmap, [0, density_logits[i].shape[2] - gt_heatmap.shape[2], + 0, density_logits[i].shape[1] - gt_heatmap.shape[1]]) + gt_density.append(gt_heatmap) + + gt_density = torch.stack(gt_density, dim=0) + loss_density = self.density_criterion(density_logits, gt_density) + return feature, density_logits, detections, dict(loss_density=loss_density) + + +def build_heatmap_head(cfg, in_channels): + return HeatmapHead(cfg, in_channels=in_channels) diff --git a/maskrcnn_benchmark/modeling/roi_heads/keypoint_head/__init__.py b/maskrcnn_benchmark/modeling/roi_heads/keypoint_head/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/maskrcnn_benchmark/modeling/roi_heads/keypoint_head/inference.py b/maskrcnn_benchmark/modeling/roi_heads/keypoint_head/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..1f6fe2be3efb883ebd7fe86ccfd00377f1e55c5c --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/keypoint_head/inference.py @@ -0,0 +1,125 @@ +import torch +from torch import nn + + +class KeypointPostProcessor(nn.Module): + def __init__(self, keypointer=None): + super(KeypointPostProcessor, self).__init__() + self.keypointer = keypointer + + def forward(self, x, boxes): + mask_prob = x + + scores = None + if self.keypointer: + mask_prob, scores = self.keypointer(x, boxes) + + assert len(boxes) == 1, "Only non-batched inference supported for now" + boxes_per_image = [box.bbox.size(0) for box in boxes] + mask_prob = mask_prob.split(boxes_per_image, dim=0) + scores = scores.split(boxes_per_image, dim=0) + + results = [] + for prob, box, score in zip(mask_prob, boxes, scores): + bbox = BoxList(box.bbox, box.size, mode="xyxy") + for field in box.fields(): + bbox.add_field(field, box.get_field(field)) + prob = PersonKeypoints(prob, box.size) + prob.add_field("logits", score) + bbox.add_field("keypoints", prob) + results.append(bbox) + + return results + + +# TODO remove and use only the Keypointer +import numpy as np +import cv2 + + +def heatmaps_to_keypoints(maps, rois): + """Extract predicted keypoint locations from heatmaps. Output has shape + (#rois, 4, #keypoints) with the 4 rows corresponding to (x, y, logit, prob) + for each keypoint. + """ + # This function converts a discrete image coordinate in a HEATMAP_SIZE x + # HEATMAP_SIZE image to a continuous keypoint coordinate. We maintain + # consistency with keypoints_to_heatmap_labels by using the conversion from + # Heckbert 1990: c = d + 0.5, where d is a discrete coordinate and c is a + # continuous coordinate. 
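+ # For example, the discrete heatmap index d = 3 becomes c = 3.5, which is
+ # then scaled by (roi width / heatmap width) and shifted by the roi's x1
+ # offset below to land in image coordinates.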
+ offset_x = rois[:, 0] + offset_y = rois[:, 1] + + widths = rois[:, 2] - rois[:, 0] + heights = rois[:, 3] - rois[:, 1] + widths = np.maximum(widths, 1) + heights = np.maximum(heights, 1) + widths_ceil = np.ceil(widths) + heights_ceil = np.ceil(heights) + + # NCHW to NHWC for use with OpenCV + maps = np.transpose(maps, [0, 2, 3, 1]) + min_size = 0 # cfg.KRCNN.INFERENCE_MIN_SIZE + num_keypoints = maps.shape[3] + xy_preds = np.zeros((len(rois), 3, num_keypoints), dtype=np.float32) + end_scores = np.zeros((len(rois), num_keypoints), dtype=np.float32) + for i in range(len(rois)): + if min_size > 0: + roi_map_width = int(np.maximum(widths_ceil[i], min_size)) + roi_map_height = int(np.maximum(heights_ceil[i], min_size)) + else: + roi_map_width = widths_ceil[i] + roi_map_height = heights_ceil[i] + width_correction = widths[i] / roi_map_width + height_correction = heights[i] / roi_map_height + roi_map = cv2.resize( + maps[i], (roi_map_width, roi_map_height), interpolation=cv2.INTER_CUBIC + ) + # Bring back to CHW + roi_map = np.transpose(roi_map, [2, 0, 1]) + # roi_map_probs = scores_to_probs(roi_map.copy()) + w = roi_map.shape[2] + pos = roi_map.reshape(num_keypoints, -1).argmax(axis=1) + x_int = pos % w + y_int = (pos - x_int) // w + # assert (roi_map_probs[k, y_int, x_int] == + # roi_map_probs[k, :, :].max()) + x = (x_int + 0.5) * width_correction + y = (y_int + 0.5) * height_correction + xy_preds[i, 0, :] = x + offset_x[i] + xy_preds[i, 1, :] = y + offset_y[i] + xy_preds[i, 2, :] = 1 + end_scores[i, :] = roi_map[np.arange(num_keypoints), y_int, x_int] + + return np.transpose(xy_preds, [0, 2, 1]), end_scores + + +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.structures.keypoint import PersonKeypoints + + +class Keypointer(object): + """ + Projects a set of masks in an image on the locations + specified by the bounding boxes + """ + + def __init__(self, padding=0): + self.padding = padding + + def __call__(self, masks, boxes): + # TODO do this properly + if isinstance(boxes, BoxList): + boxes = [boxes] + assert len(boxes) == 1 + + result, scores = heatmaps_to_keypoints( + masks.detach().cpu().numpy(), boxes[0].bbox.cpu().numpy() + ) + return torch.from_numpy(result).to(masks.device), torch.as_tensor(scores, device=masks.device) + + +def make_roi_keypoint_post_processor(cfg): + keypointer = Keypointer() + keypoint_post_processor = KeypointPostProcessor(keypointer) + return keypoint_post_processor diff --git a/maskrcnn_benchmark/modeling/roi_heads/keypoint_head/keypoint_head.py b/maskrcnn_benchmark/modeling/roi_heads/keypoint_head/keypoint_head.py new file mode 100644 index 0000000000000000000000000000000000000000..5a842cad3f37343c7906cc30e86388513f9ba521 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/keypoint_head/keypoint_head.py @@ -0,0 +1,51 @@ +import torch + +from .roi_keypoint_feature_extractors import make_roi_keypoint_feature_extractor +from .roi_keypoint_predictors import make_roi_keypoint_predictor +from .inference import make_roi_keypoint_post_processor +from .loss import make_roi_keypoint_loss_evaluator + + +class ROIKeypointHead(torch.nn.Module): + def __init__(self, cfg, in_channels): + super(ROIKeypointHead, self).__init__() + self.cfg = cfg.clone() + self.feature_extractor = make_roi_keypoint_feature_extractor(cfg, in_channels) + self.predictor = make_roi_keypoint_predictor( + cfg, self.feature_extractor.out_channels) + self.post_processor = make_roi_keypoint_post_processor(cfg) + self.loss_evaluator = 
make_roi_keypoint_loss_evaluator(cfg) + + def forward(self, features, proposals, targets=None): + """ + Arguments: + features (list[Tensor]): feature-maps from possibly several levels + proposals (list[BoxList]): proposal boxes + targets (list[BoxList], optional): the ground-truth targets. + + Returns: + x (Tensor): the result of the feature extractor + proposals (list[BoxList]): during training, the original proposals + are returned. During testing, the predicted boxlists are returned + with the `mask` field set + losses (dict[Tensor]): During training, returns the losses for the + head. During testing, returns an empty dict. + """ + if self.training: + with torch.no_grad(): + proposals = self.loss_evaluator.subsample(proposals, targets) + + x = self.feature_extractor(features, proposals) + kp_logits = self.predictor(x) + + if not self.training: + result = self.post_processor(kp_logits, proposals) + return x, result, {} + + loss_kp = self.loss_evaluator(proposals, kp_logits) + + return x, proposals, dict(loss_kp=loss_kp) + + +def build_roi_keypoint_head(cfg, in_channels): + return ROIKeypointHead(cfg, in_channels) diff --git a/maskrcnn_benchmark/modeling/roi_heads/keypoint_head/loss.py b/maskrcnn_benchmark/modeling/roi_heads/keypoint_head/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..6ba3a72a4efc9834f7b85a5923ae6ebe3abcb464 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/keypoint_head/loss.py @@ -0,0 +1,183 @@ +import torch +from torch.nn import functional as F + +from maskrcnn_benchmark.modeling.matcher import Matcher + +from maskrcnn_benchmark.modeling.balanced_positive_negative_sampler import ( + BalancedPositiveNegativeSampler, +) +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou +from maskrcnn_benchmark.modeling.utils import cat +from maskrcnn_benchmark.layers import smooth_l1_loss +from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist + +from maskrcnn_benchmark.structures.keypoint import keypoints_to_heat_map + + +def project_keypoints_to_heatmap(keypoints, proposals, discretization_size): + proposals = proposals.convert("xyxy") + return keypoints_to_heat_map( + keypoints.keypoints, proposals.bbox, discretization_size + ) + + +def cat_boxlist_with_keypoints(boxlists): + assert all(boxlist.has_field("keypoints") for boxlist in boxlists) + + kp = [boxlist.get_field("keypoints").keypoints for boxlist in boxlists] + kp = cat(kp, 0) + + fields = boxlists[0].get_fields() + fields = [field for field in fields if field != "keypoints"] + + boxlists = [boxlist.copy_with_fields(fields) for boxlist in boxlists] + boxlists = cat_boxlist(boxlists) + boxlists.add_field("keypoints", kp) + return boxlists + + +def _within_box(points, boxes): + """Validate which keypoints are contained inside a given box. 
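+ A keypoint (x, y) counts as inside box (x1, y1, x2, y2) when
+ x1 <= x <= x2 and y1 <= y <= y2 (boundaries inclusive).
+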
+ points: NxKx2 + boxes: Nx4 + output: NxK + """ + x_within = (points[..., 0] >= boxes[:, 0, None]) & ( + points[..., 0] <= boxes[:, 2, None] + ) + y_within = (points[..., 1] >= boxes[:, 1, None]) & ( + points[..., 1] <= boxes[:, 3, None] + ) + return x_within & y_within + + +class KeypointRCNNLossComputation(object): + def __init__(self, proposal_matcher, fg_bg_sampler, discretization_size): + """ + Arguments: + proposal_matcher (Matcher) + fg_bg_sampler (BalancedPositiveNegativeSampler) + discretization_size (int) + """ + self.proposal_matcher = proposal_matcher + self.fg_bg_sampler = fg_bg_sampler + self.discretization_size = discretization_size + + def match_targets_to_proposals(self, proposal, target): + match_quality_matrix = boxlist_iou(target, proposal) + matched_idxs = self.proposal_matcher(match_quality_matrix) + # Keypoint RCNN needs "labels" and "keypoints "fields for creating the targets + target = target.copy_with_fields(["labels", "keypoints"]) + # get the targets corresponding GT for each proposal + # NB: need to clamp the indices because we can have a single + # GT in the image, and matched_idxs can be -2, which goes + # out of bounds + matched_targets = target[matched_idxs.clamp(min=0)] + matched_targets.add_field("matched_idxs", matched_idxs) + return matched_targets + + def prepare_targets(self, proposals, targets): + labels = [] + keypoints = [] + for proposals_per_image, targets_per_image in zip(proposals, targets): + matched_targets = self.match_targets_to_proposals( + proposals_per_image, targets_per_image + ) + matched_idxs = matched_targets.get_field("matched_idxs") + + labels_per_image = matched_targets.get_field("labels") + labels_per_image = labels_per_image.to(dtype=torch.int64) + + # this can probably be removed, but is left here for clarity + # and completeness + # TODO check if this is the right one, as BELOW_THRESHOLD + neg_inds = matched_idxs == Matcher.BELOW_LOW_THRESHOLD + labels_per_image[neg_inds] = 0 + + keypoints_per_image = matched_targets.get_field("keypoints") + within_box = _within_box( + keypoints_per_image.keypoints, matched_targets.bbox + ) + vis_kp = keypoints_per_image.keypoints[..., 2] > 0 + is_visible = (within_box & vis_kp).sum(1) > 0 + + labels_per_image[~is_visible] = -1 + + labels.append(labels_per_image) + keypoints.append(keypoints_per_image) + + return labels, keypoints + + def subsample(self, proposals, targets): + """ + This method performs the positive/negative sampling, and return + the sampled proposals. + Note: this function keeps a state. 
+ + Arguments: + proposals (list[BoxList]) + targets (list[BoxList]) + """ + + labels, keypoints = self.prepare_targets(proposals, targets) + sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels) + + proposals = list(proposals) + # add corresponding label and regression_targets information to the bounding boxes + for labels_per_image, keypoints_per_image, proposals_per_image in zip( + labels, keypoints, proposals + ): + proposals_per_image.add_field("labels", labels_per_image) + proposals_per_image.add_field("keypoints", keypoints_per_image) + + # distributed sampled proposals, that were obtained on all feature maps + # concatenated via the fg_bg_sampler, into individual feature map levels + for img_idx, (pos_inds_img, neg_inds_img) in enumerate( + zip(sampled_pos_inds, sampled_neg_inds) + ): + img_sampled_inds = torch.nonzero(pos_inds_img).squeeze(1) + proposals_per_image = proposals[img_idx][img_sampled_inds] + proposals[img_idx] = proposals_per_image + + self._proposals = proposals + return proposals + + def __call__(self, proposals, keypoint_logits): + heatmaps = [] + valid = [] + for proposals_per_image in proposals: + kp = proposals_per_image.get_field("keypoints") + heatmaps_per_image, valid_per_image = project_keypoints_to_heatmap( + kp, proposals_per_image, self.discretization_size + ) + heatmaps.append(heatmaps_per_image.view(-1)) + valid.append(valid_per_image.view(-1)) + + keypoint_targets = cat(heatmaps, dim=0) + valid = cat(valid, dim=0).to(dtype=torch.uint8) + valid = torch.nonzero(valid).squeeze(1) + + # torch.mean (in binary_cross_entropy_with_logits) does'nt + # accept empty tensors, so handle it sepaartely + if keypoint_targets.numel() == 0 or len(valid) == 0: + return keypoint_logits.sum() * 0 + + N, K, H, W = keypoint_logits.shape + keypoint_logits = keypoint_logits.view(N * K, H * W) + + keypoint_loss = F.cross_entropy(keypoint_logits[valid], keypoint_targets[valid]) + return keypoint_loss + + +def make_roi_keypoint_loss_evaluator(cfg): + matcher = Matcher( + cfg.MODEL.ROI_HEADS.FG_IOU_THRESHOLD, + cfg.MODEL.ROI_HEADS.BG_IOU_THRESHOLD, + allow_low_quality_matches=False, + ) + fg_bg_sampler = BalancedPositiveNegativeSampler( + cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE, cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION + ) + resolution = cfg.MODEL.ROI_KEYPOINT_HEAD.RESOLUTION + loss_evaluator = KeypointRCNNLossComputation(matcher, fg_bg_sampler, resolution) + return loss_evaluator diff --git a/maskrcnn_benchmark/modeling/roi_heads/keypoint_head/roi_keypoint_feature_extractors.py b/maskrcnn_benchmark/modeling/roi_heads/keypoint_head/roi_keypoint_feature_extractors.py new file mode 100644 index 0000000000000000000000000000000000000000..952ae81c97117df349e1910df08cfe82e87c193a --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/keypoint_head/roi_keypoint_feature_extractors.py @@ -0,0 +1,50 @@ +from torch import nn +from torch.nn import functional as F + +from maskrcnn_benchmark.modeling import registry +from maskrcnn_benchmark.modeling.poolers import Pooler + +from maskrcnn_benchmark.layers import Conv2d + + +@registry.ROI_KEYPOINT_FEATURE_EXTRACTORS.register("KeypointRCNNFeatureExtractor") +class KeypointRCNNFeatureExtractor(nn.Module): + def __init__(self, cfg, in_channels): + super(KeypointRCNNFeatureExtractor, self).__init__() + + resolution = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION + scales = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_SCALES + sampling_ratio = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO + pooler = Pooler( + output_size=(resolution, resolution), 
+ scales=scales, + sampling_ratio=sampling_ratio, + ) + self.pooler = pooler + + input_features = in_channels + layers = cfg.MODEL.ROI_KEYPOINT_HEAD.CONV_LAYERS + next_feature = input_features + self.blocks = [] + for layer_idx, layer_features in enumerate(layers, 1): + layer_name = "conv_fcn{}".format(layer_idx) + module = Conv2d(next_feature, layer_features, 3, stride=1, padding=1) + nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu") + nn.init.constant_(module.bias, 0) + self.add_module(layer_name, module) + next_feature = layer_features + self.blocks.append(layer_name) + self.out_channels = layer_features + + def forward(self, x, proposals): + x = self.pooler(x, proposals) + for layer_name in self.blocks: + x = F.relu(getattr(self, layer_name)(x)) + return x + + +def make_roi_keypoint_feature_extractor(cfg, in_channels): + func = registry.ROI_KEYPOINT_FEATURE_EXTRACTORS[ + cfg.MODEL.ROI_KEYPOINT_HEAD.FEATURE_EXTRACTOR + ] + return func(cfg, in_channels) diff --git a/maskrcnn_benchmark/modeling/roi_heads/keypoint_head/roi_keypoint_predictors.py b/maskrcnn_benchmark/modeling/roi_heads/keypoint_head/roi_keypoint_predictors.py new file mode 100644 index 0000000000000000000000000000000000000000..7193efc25b5204e6f7f549a1a073111dff935473 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/keypoint_head/roi_keypoint_predictors.py @@ -0,0 +1,38 @@ +from torch import nn + +from maskrcnn_benchmark import layers +from maskrcnn_benchmark.modeling import registry + + +@registry.ROI_KEYPOINT_PREDICTOR.register("KeypointRCNNPredictor") +class KeypointRCNNPredictor(nn.Module): + def __init__(self, cfg, in_channels): + super(KeypointRCNNPredictor, self).__init__() + input_features = in_channels + num_keypoints = cfg.MODEL.ROI_KEYPOINT_HEAD.NUM_CLASSES + deconv_kernel = 4 + self.kps_score_lowres = layers.ConvTranspose2d( + input_features, + num_keypoints, + deconv_kernel, + stride=2, + padding=deconv_kernel // 2 - 1, + ) + nn.init.kaiming_normal_( + self.kps_score_lowres.weight, mode="fan_out", nonlinearity="relu" + ) + nn.init.constant_(self.kps_score_lowres.bias, 0) + self.up_scale = 2 + self.out_channels = num_keypoints + + def forward(self, x): + x = self.kps_score_lowres(x) + x = layers.interpolate( + x, scale_factor=self.up_scale, mode="bilinear", align_corners=False + ) + return x + + +def make_roi_keypoint_predictor(cfg, in_channels): + func = registry.ROI_KEYPOINT_PREDICTOR[cfg.MODEL.ROI_KEYPOINT_HEAD.PREDICTOR] + return func(cfg, in_channels) diff --git a/maskrcnn_benchmark/modeling/roi_heads/mask_head/__init__.py b/maskrcnn_benchmark/modeling/roi_heads/mask_head/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/maskrcnn_benchmark/modeling/roi_heads/mask_head/inference.py b/maskrcnn_benchmark/modeling/roi_heads/mask_head/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..e89d513c9e0d8a6746f7a236e44678a835000185 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/mask_head/inference.py @@ -0,0 +1,204 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
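+# Shape sketch for the keypoint predictor in roi_keypoint_predictors.py above
+# (illustrative; assumes a 14x14 pooled feature): the ConvTranspose2d with
+# kernel 4, stride 2, padding 1 produces 28x28 logits, and the bilinear
+# interpolation with up_scale=2 then yields 56x56 keypoint heatmaps per RoI.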
+import numpy as np +import torch +from torch import nn +import torch.nn.functional as F + +from maskrcnn_benchmark.structures.bounding_box import BoxList + + +# TODO check if want to return a single BoxList or a composite +# object +class MaskPostProcessor(nn.Module): + """ + From the results of the CNN, post process the masks + by taking the mask corresponding to the class with max + probability (which are of fixed size and directly output + by the CNN) and return the masks in the mask field of the BoxList. + + If a masker object is passed, it will additionally + project the masks in the image according to the locations in boxes, + """ + + def __init__(self, masker=None): + super(MaskPostProcessor, self).__init__() + self.masker = masker + + def forward(self, x, boxes): + """ + Arguments: + x (Tensor): the mask logits + boxes (list[BoxList]): bounding boxes that are used as + reference, one for ech image + + Returns: + results (list[BoxList]): one BoxList for each image, containing + the extra field mask + """ + mask_prob = x.sigmoid() + + # select masks coresponding to the predicted classes + num_masks = x.shape[0] + labels = [bbox.get_field("labels") for bbox in boxes] + labels = torch.cat(labels) + index = torch.arange(num_masks, device=labels.device) + mask_prob = mask_prob[index, labels][:, None] + + boxes_per_image = [len(box) for box in boxes] + mask_prob = mask_prob.split(boxes_per_image, dim=0) + + if self.masker: + mask_prob = self.masker(mask_prob, boxes) + + results = [] + for prob, box in zip(mask_prob, boxes): + bbox = BoxList(box.bbox, box.size, mode="xyxy") + for field in box.fields(): + bbox.add_field(field, box.get_field(field)) + bbox.add_field("mask", prob) + results.append(bbox) + + return results + + +class MaskPostProcessorCOCOFormat(MaskPostProcessor): + """ + From the results of the CNN, post process the results + so that the masks are pasted in the image, and + additionally convert the results to COCO format. 
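+ Masks are RLE-encoded with pycocotools and the (utf-8 decoded) RLE dicts
+ replace the tensor in the "mask" field.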
+ """ + + def forward(self, x, boxes): + import pycocotools.mask as mask_util + import numpy as np + + results = super(MaskPostProcessorCOCOFormat, self).forward(x, boxes) + for result in results: + masks = result.get_field("mask").cpu() + rles = [ + mask_util.encode(np.array(mask[0, :, :, np.newaxis], order="F"))[0] + for mask in masks + ] + for rle in rles: + rle["counts"] = rle["counts"].decode("utf-8") + result.add_field("mask", rles) + return results + + +# the next two functions should be merged inside Masker +# but are kept here for the moment while we need them +# temporarily gor paste_mask_in_image +def expand_boxes(boxes, scale): + w_half = (boxes[:, 2] - boxes[:, 0]) * .5 + h_half = (boxes[:, 3] - boxes[:, 1]) * .5 + x_c = (boxes[:, 2] + boxes[:, 0]) * .5 + y_c = (boxes[:, 3] + boxes[:, 1]) * .5 + + w_half *= scale + h_half *= scale + + boxes_exp = torch.zeros_like(boxes) + boxes_exp[:, 0] = x_c - w_half + boxes_exp[:, 2] = x_c + w_half + boxes_exp[:, 1] = y_c - h_half + boxes_exp[:, 3] = y_c + h_half + return boxes_exp + + +def expand_masks(mask, padding): + N = mask.shape[0] + M = mask.shape[-1] + pad2 = 2 * padding + scale = float(M + pad2) / M + padded_mask = mask.new_zeros((N, 1, M + pad2, M + pad2)) + padded_mask[:, :, padding:-padding, padding:-padding] = mask + return padded_mask, scale + + +def paste_mask_in_image(mask, box, im_h, im_w, thresh=0.5, padding=1): + padded_mask, scale = expand_masks(mask[None], padding=padding) + mask = padded_mask[0, 0] + box = expand_boxes(box[None], scale)[0] + box = box.to(dtype=torch.int32) + + TO_REMOVE = 1 + w = int(box[2] - box[0] + TO_REMOVE) + h = int(box[3] - box[1] + TO_REMOVE) + w = max(w, 1) + h = max(h, 1) + + # Set shape to [batchxCxHxW] + mask = mask.expand((1, 1, -1, -1)) + + # Resize mask + mask = mask.to(torch.float32) + mask = F.interpolate(mask, size=(h, w), mode='bilinear', align_corners=False) + mask = mask[0][0] + + if thresh >= 0: + mask = mask > thresh + else: + # for visualization and debugging, we also + # allow it to return an unmodified mask + mask = (mask * 255).to(torch.uint8) + + im_mask = torch.zeros((im_h, im_w), dtype=torch.uint8) + x_0 = max(box[0], 0) + x_1 = min(box[2] + 1, im_w) + y_0 = max(box[1], 0) + y_1 = min(box[3] + 1, im_h) + + im_mask[y_0:y_1, x_0:x_1] = mask[ + (y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0]) + ] + return im_mask + + +class Masker(object): + """ + Projects a set of masks in an image on the locations + specified by the bounding boxes + """ + + def __init__(self, threshold=0.5, padding=1): + self.threshold = threshold + self.padding = padding + + def forward_single_image(self, masks, boxes): + boxes = boxes.convert("xyxy") + im_w, im_h = boxes.size + res = [ + paste_mask_in_image(mask[0], box, im_h, im_w, self.threshold, self.padding) + for mask, box in zip(masks, boxes.bbox) + ] + if len(res) > 0: + res = torch.stack(res, dim=0)[:, None] + else: + res = masks.new_empty((0, 1, masks.shape[-2], masks.shape[-1])) + return res + + def __call__(self, masks, boxes): + if isinstance(boxes, BoxList): + boxes = [boxes] + + # Make some sanity check + assert len(boxes) == len(masks), "Masks and boxes should have the same length." + + # TODO: Is this JIT compatible? + # If not we should make it compatible. + results = [] + for mask, box in zip(masks, boxes): + assert mask.shape[0] == len(box), "Number of objects should be the same." 
+ result = self.forward_single_image(mask, box) + results.append(result) + return results + + +def make_roi_mask_post_processor(cfg): + if cfg.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS: + mask_threshold = cfg.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS_THRESHOLD + masker = Masker(threshold=mask_threshold, padding=1) + else: + masker = None + mask_post_processor = MaskPostProcessor(masker) + return mask_post_processor diff --git a/maskrcnn_benchmark/modeling/roi_heads/mask_head/loss.py b/maskrcnn_benchmark/modeling/roi_heads/mask_head/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..36dcaa3252a2b07f3e3b0ee391ab0a91a5ed816a --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/mask_head/loss.py @@ -0,0 +1,144 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +from torch.nn import functional as F + +from maskrcnn_benchmark.layers import smooth_l1_loss +from maskrcnn_benchmark.modeling.matcher import Matcher +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou +from maskrcnn_benchmark.modeling.utils import cat + + +def project_masks_on_boxes(segmentation_masks, proposals, discretization_size): + """ + Given segmentation masks and the bounding boxes corresponding + to the location of the masks in the image, this function + crops and resizes the masks in the position defined by the + boxes. This prepares the masks for them to be fed to the + loss computation as the targets. + + Arguments: + segmentation_masks: an instance of SegmentationMask + proposals: an instance of BoxList + """ + masks = [] + M = discretization_size + device = proposals.bbox.device + proposals = proposals.convert("xyxy") + assert segmentation_masks.size == proposals.size, "{}, {}".format( + segmentation_masks, proposals + ) + # TODO put the proposals on the CPU, as the representation for the + # masks is not efficient GPU-wise (possibly several small tensors for + # representing a single instance mask) + proposals = proposals.bbox.to(torch.device("cpu")) + for segmentation_mask, proposal in zip(segmentation_masks, proposals): + # crop the masks, resize them to the desired resolution and + # then convert them to the tensor representation, + # instead of the list representation that was used + cropped_mask = segmentation_mask.crop(proposal) + scaled_mask = cropped_mask.resize((M, M)) + mask = scaled_mask.convert(mode="mask") + masks.append(mask) + if len(masks) == 0: + return torch.empty(0, dtype=torch.float32, device=device) + return torch.stack(masks, dim=0).to(device, dtype=torch.float32) + + +class MaskRCNNLossComputation(object): + def __init__(self, proposal_matcher, discretization_size): + """ + Arguments: + proposal_matcher (Matcher) + discretization_size (int) + """ + self.proposal_matcher = proposal_matcher + self.discretization_size = discretization_size + + def match_targets_to_proposals(self, proposal, target): + match_quality_matrix = boxlist_iou(target, proposal) + matched_idxs = self.proposal_matcher(match_quality_matrix) + # Mask RCNN needs "labels" and "masks "fields for creating the targets + target = target.copy_with_fields(["labels", "masks"]) + # get the targets corresponding GT for each proposal + # NB: need to clamp the indices because we can have a single + # GT in the image, and matched_idxs can be -2, which goes + # out of bounds + matched_targets = target[matched_idxs.clamp(min=0)] + matched_targets.add_field("matched_idxs", matched_idxs) + return matched_targets + + def prepare_targets(self, proposals, targets): + labels = 
[] + masks = [] + for proposals_per_image, targets_per_image in zip(proposals, targets): + matched_targets = self.match_targets_to_proposals( + proposals_per_image, targets_per_image + ) + matched_idxs = matched_targets.get_field("matched_idxs") + + labels_per_image = matched_targets.get_field("labels") + labels_per_image = labels_per_image.to(dtype=torch.int64) + + # this can probably be removed, but is left here for clarity + # and completeness + neg_inds = matched_idxs == Matcher.BELOW_LOW_THRESHOLD + labels_per_image[neg_inds] = 0 + + # mask scores are only computed on positive samples + positive_inds = torch.nonzero(labels_per_image > 0).squeeze(1) + + segmentation_masks = matched_targets.get_field("masks") + segmentation_masks = segmentation_masks[positive_inds] + + positive_proposals = proposals_per_image[positive_inds] + + masks_per_image = project_masks_on_boxes( + segmentation_masks, positive_proposals, self.discretization_size + ) + + labels.append(labels_per_image) + masks.append(masks_per_image) + + return labels, masks + + def __call__(self, proposals, mask_logits, targets): + """ + Arguments: + proposals (list[BoxList]) + mask_logits (Tensor) + targets (list[BoxList]) + + Return: + mask_loss (Tensor): scalar tensor containing the loss + """ + labels, mask_targets = self.prepare_targets(proposals, targets) + + labels = cat(labels, dim=0) + mask_targets = cat(mask_targets, dim=0) + + positive_inds = torch.nonzero(labels > 0).squeeze(1) + labels_pos = labels[positive_inds] + + # torch.mean (in binary_cross_entropy_with_logits) doesn't + # accept empty tensors, so handle it separately + if mask_targets.numel() == 0: + return mask_logits.sum() * 0 + + mask_loss = F.binary_cross_entropy_with_logits( + mask_logits[positive_inds, labels_pos], mask_targets + ) + return mask_loss + + +def make_roi_mask_loss_evaluator(cfg): + matcher = Matcher( + cfg.MODEL.ROI_HEADS.FG_IOU_THRESHOLD, + cfg.MODEL.ROI_HEADS.BG_IOU_THRESHOLD, + allow_low_quality_matches=False, + ) + + loss_evaluator = MaskRCNNLossComputation( + matcher, cfg.MODEL.ROI_MASK_HEAD.RESOLUTION + ) + + return loss_evaluator diff --git a/maskrcnn_benchmark/modeling/roi_heads/mask_head/mask_head.py b/maskrcnn_benchmark/modeling/roi_heads/mask_head/mask_head.py new file mode 100644 index 0000000000000000000000000000000000000000..a9ce245b618287f6dfe1730db83b625094b0592e --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/mask_head/mask_head.py @@ -0,0 +1,83 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +from torch import nn + +from maskrcnn_benchmark.structures.bounding_box import BoxList + +from .roi_mask_feature_extractors import make_roi_mask_feature_extractor +from .roi_mask_predictors import make_roi_mask_predictor +from .inference import make_roi_mask_post_processor +from .loss import make_roi_mask_loss_evaluator + + +def keep_only_positive_boxes(boxes): + """ + Given a set of BoxList containing the `labels` field, + return a set of BoxList for which `labels > 0`. 
+ + Arguments: + boxes (list of BoxList) + """ + assert isinstance(boxes, (list, tuple)) + assert isinstance(boxes[0], BoxList) + assert boxes[0].has_field("labels") + positive_boxes = [] + positive_inds = [] + num_boxes = 0 + for boxes_per_image in boxes: + labels = boxes_per_image.get_field("labels") + inds_mask = labels > 0 + inds = inds_mask.nonzero().squeeze(1) + positive_boxes.append(boxes_per_image[inds]) + positive_inds.append(inds_mask) + return positive_boxes, positive_inds + + +class ROIMaskHead(torch.nn.Module): + def __init__(self, cfg, in_channels): + super(ROIMaskHead, self).__init__() + self.cfg = cfg.clone() + self.feature_extractor = make_roi_mask_feature_extractor(cfg, in_channels) + self.predictor = make_roi_mask_predictor( + cfg, self.feature_extractor.out_channels) + self.post_processor = make_roi_mask_post_processor(cfg) + self.loss_evaluator = make_roi_mask_loss_evaluator(cfg) + + def forward(self, features, proposals, targets=None): + """ + Arguments: + features (list[Tensor]): feature-maps from possibly several levels + proposals (list[BoxList]): proposal boxes + targets (list[BoxList], optional): the ground-truth targets. + + Returns: + x (Tensor): the result of the feature extractor + proposals (list[BoxList]): during training, the original proposals + are returned. During testing, the predicted boxlists are returned + with the `mask` field set + losses (dict[Tensor]): During training, returns the losses for the + head. During testing, returns an empty dict. + """ + + if self.training: + # during training, only focus on positive boxes + all_proposals = proposals + proposals, positive_inds = keep_only_positive_boxes(proposals) + if self.training and self.cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR: + x = features + x = x[torch.cat(positive_inds, dim=0)] + else: + x = self.feature_extractor(features, proposals) + mask_logits = self.predictor(x) + + if not self.training: + result = self.post_processor(mask_logits, proposals) + return x, result, {} + + loss_mask = self.loss_evaluator(proposals, mask_logits, targets) + + return x, all_proposals, dict(loss_mask=loss_mask) + + +def build_roi_mask_head(cfg, in_channels): + return ROIMaskHead(cfg, in_channels) diff --git a/maskrcnn_benchmark/modeling/roi_heads/mask_head/roi_mask_feature_extractors.py b/maskrcnn_benchmark/modeling/roi_heads/mask_head/roi_mask_feature_extractors.py new file mode 100644 index 0000000000000000000000000000000000000000..117edc4cc2b7af3758e14e88ef509b7370a30323 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/mask_head/roi_mask_feature_extractors.py @@ -0,0 +1,72 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
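+# A minimal sketch (assuming a populated cfg): the extractor below is selected by +# name through the registry, e.g. with +# cfg.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR = "MaskRCNNFPNFeatureExtractor": +# +#     extractor = make_roi_mask_feature_extractor(cfg, in_channels) +#     x = extractor(features, proposals)  # per-proposal pooled and convolved features +# +# where `features` are the FPN feature maps and `proposals` a list[BoxList].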
+from torch import nn +from torch.nn import functional as F + +from ..box_head.roi_box_feature_extractors import ResNet50Conv5ROIFeatureExtractor +from maskrcnn_benchmark.modeling import registry +from maskrcnn_benchmark.modeling.poolers import Pooler +from maskrcnn_benchmark.modeling.make_layers import make_conv3x3 + + +registry.ROI_MASK_FEATURE_EXTRACTORS.register( + "ResNet50Conv5ROIFeatureExtractor", ResNet50Conv5ROIFeatureExtractor +) + + +@registry.ROI_MASK_FEATURE_EXTRACTORS.register("MaskRCNNFPNFeatureExtractor") +class MaskRCNNFPNFeatureExtractor(nn.Module): + """ + Heads for FPN for classification + """ + + def __init__(self, cfg, in_channels): + """ + Arguments: + num_classes (int): number of output classes + input_size (int): number of channels of the input once it's flattened + representation_size (int): size of the intermediate representation + """ + super(MaskRCNNFPNFeatureExtractor, self).__init__() + + resolution = cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION + scales = cfg.MODEL.ROI_MASK_HEAD.POOLER_SCALES + sampling_ratio = cfg.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO + pooler = Pooler( + output_size=(resolution, resolution), + scales=scales, + sampling_ratio=sampling_ratio, + ) + input_size = in_channels + self.pooler = pooler + + use_gn = cfg.MODEL.ROI_MASK_HEAD.USE_GN + layers = cfg.MODEL.ROI_MASK_HEAD.CONV_LAYERS + dilation = cfg.MODEL.ROI_MASK_HEAD.DILATION + + next_feature = input_size + self.blocks = [] + for layer_idx, layer_features in enumerate(layers, 1): + layer_name = "mask_fcn{}".format(layer_idx) + module = make_conv3x3( + next_feature, layer_features, + dilation=dilation, stride=1, use_gn=use_gn + ) + self.add_module(layer_name, module) + next_feature = layer_features + self.blocks.append(layer_name) + self.out_channels = layer_features + + def forward(self, x, proposals): + x = self.pooler(x, proposals) + + for layer_name in self.blocks: + x = F.relu(getattr(self, layer_name)(x)) + + return x + + +def make_roi_mask_feature_extractor(cfg, in_channels): + func = registry.ROI_MASK_FEATURE_EXTRACTORS[ + cfg.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR + ] + return func(cfg, in_channels) diff --git a/maskrcnn_benchmark/modeling/roi_heads/mask_head/roi_mask_predictors.py b/maskrcnn_benchmark/modeling/roi_heads/mask_head/roi_mask_predictors.py new file mode 100644 index 0000000000000000000000000000000000000000..c954e332eb0288c168fa3467f34d103b059254c0 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/mask_head/roi_mask_predictors.py @@ -0,0 +1,57 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
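+# A minimal sketch (assuming a populated cfg): the predictor is chosen by name via +# cfg.MODEL.ROI_MASK_HEAD.PREDICTOR, e.g. "MaskRCNNC4Predictor", and maps the +# extracted per-proposal features to per-class mask logits: +# +#     predictor = make_roi_mask_predictor(cfg, in_channels) +#     mask_logits = predictor(x)  # (num_boxes, num_classes, 2*H, 2*W) for the C4 predictor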
+from torch import nn +from torch.nn import functional as F + +from maskrcnn_benchmark.layers import Conv2d +from maskrcnn_benchmark.layers import ConvTranspose2d +from maskrcnn_benchmark.modeling import registry + + +@registry.ROI_MASK_PREDICTOR.register("MaskRCNNC4Predictor") +class MaskRCNNC4Predictor(nn.Module): + def __init__(self, cfg, in_channels): + super(MaskRCNNC4Predictor, self).__init__() + num_classes = cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES + dim_reduced = cfg.MODEL.ROI_MASK_HEAD.CONV_LAYERS[-1] + num_inputs = in_channels + + self.conv5_mask = ConvTranspose2d(num_inputs, dim_reduced, 2, 2, 0) + self.mask_fcn_logits = Conv2d(dim_reduced, num_classes, 1, 1, 0) + + for name, param in self.named_parameters(): + if "bias" in name: + nn.init.constant_(param, 0) + elif "weight" in name: + # Caffe2 implementation uses MSRAFill, which in fact + # corresponds to kaiming_normal_ in PyTorch + nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu") + + def forward(self, x): + x = F.relu(self.conv5_mask(x)) + return self.mask_fcn_logits(x) + + +@registry.ROI_MASK_PREDICTOR.register("MaskRCNNConv1x1Predictor") +class MaskRCNNConv1x1Predictor(nn.Module): + def __init__(self, cfg, in_channels): + super(MaskRCNNConv1x1Predictor, self).__init__() + num_classes = cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES + num_inputs = in_channels + + self.mask_fcn_logits = Conv2d(num_inputs, num_classes, 1, 1, 0) + + for name, param in self.named_parameters(): + if "bias" in name: + nn.init.constant_(param, 0) + elif "weight" in name: + # Caffe2 implementation uses MSRAFill, which in fact + # corresponds to kaiming_normal_ in PyTorch + nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu") + + def forward(self, x): + return self.mask_fcn_logits(x) + + +def make_roi_mask_predictor(cfg, in_channels): + func = registry.ROI_MASK_PREDICTOR[cfg.MODEL.ROI_MASK_HEAD.PREDICTOR] + return func(cfg, in_channels) diff --git a/maskrcnn_benchmark/modeling/roi_heads/roi_heads.py b/maskrcnn_benchmark/modeling/roi_heads/roi_heads.py new file mode 100644 index 0000000000000000000000000000000000000000..ee3c8e1d291b03b7982630d3c905cfa21f58fd27 --- /dev/null +++ b/maskrcnn_benchmark/modeling/roi_heads/roi_heads.py @@ -0,0 +1,85 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch + +from maskrcnn_benchmark.modeling.roi_heads.heatmap_head.heatmap_head import build_heatmap_head +from .box_head.box_head import build_roi_box_head +from .mask_head.mask_head import build_roi_mask_head +from .keypoint_head.keypoint_head import build_roi_keypoint_head + + +class CombinedROIHeads(torch.nn.ModuleDict): + """ + Combines a set of individual heads (for box prediction or masks) into a single + head. 
+ """ + + def __init__(self, cfg, heads): + super(CombinedROIHeads, self).__init__(heads) + self.cfg = cfg.clone() + if cfg.MODEL.MASK_ON and cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR: + self.mask.feature_extractor = self.box.feature_extractor + if cfg.MODEL.KEYPOINT_ON and cfg.MODEL.ROI_KEYPOINT_HEAD.SHARE_BOX_FEATURE_EXTRACTOR: + self.keypoint.feature_extractor = self.box.feature_extractor + + def forward(self, features, proposals, targets=None): + losses = {} + # TODO rename x to roi_box_features, if it doesn't increase memory consumption + x, detections, loss_box = self.box(features, proposals, targets) + losses.update(loss_box) + if self.cfg.MODEL.MASK_ON: + mask_features = features + # optimization: during training, if we share the feature extractor between + # the box and the mask heads, then we can reuse the features already computed + if ( + self.training + and self.cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR + ): + mask_features = x + # During training, self.box() will return the unaltered proposals as "detections" + # this makes the API consistent during training and testing + x, detections, loss_mask = self.mask(mask_features, detections, targets) + losses.update(loss_mask) + + if self.cfg.MODEL.KEYPOINT_ON: + keypoint_features = features + # optimization: during training, if we share the feature extractor between + # the box and the mask heads, then we can reuse the features already computed + if ( + self.training + and self.cfg.MODEL.ROI_KEYPOINT_HEAD.SHARE_BOX_FEATURE_EXTRACTOR + ): + keypoint_features = x + # During training, self.box() will return the unaltered proposals as "detections" + # this makes the API consistent during training and testing + x, detections, loss_keypoint = self.keypoint(keypoint_features, detections, targets) + losses.update(loss_keypoint) + + if self.cfg.MODEL.HEATMAP_ON: + heatmap_features = features + x, density_logits, detections, heatmap_loss = self.heatmap_head(heatmap_features, detections, targets) + losses.update(heatmap_loss) + + return x, detections, losses + + +def build_roi_heads(cfg, in_channels, need_density_head=True): + # individually create the heads, that will be combined together + # afterwards + roi_heads = [] + if cfg.MODEL.RETINANET_ON: + return [] + + if not cfg.MODEL.RPN_ONLY: + roi_heads.append(("box", build_roi_box_head(cfg, in_channels))) + if cfg.MODEL.MASK_ON: + roi_heads.append(("mask", build_roi_mask_head(cfg, in_channels))) + if cfg.MODEL.KEYPOINT_ON: + roi_heads.append(("keypoint", build_roi_keypoint_head(cfg, in_channels))) + if cfg.MODEL.HEATMAP_ON: + roi_heads.append(("heatmap_head", build_heatmap_head(cfg, in_channels))) + + # combine individual heads in a single module + if roi_heads: + roi_heads = CombinedROIHeads(cfg, roi_heads) + + return roi_heads diff --git a/maskrcnn_benchmark/modeling/rpn/__init__.py b/maskrcnn_benchmark/modeling/rpn/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b01f30cfddd8ed97d5a39f55641fbc929297d885 --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# from .rpn import build_rpn diff --git a/maskrcnn_benchmark/modeling/rpn/anchor_generator.py b/maskrcnn_benchmark/modeling/rpn/anchor_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..c3f7ef1d8d77d09695e4aad5b11d65bf8c8f1209 --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/anchor_generator.py @@ -0,0 +1,289 @@ +# Copyright (c) Facebook, Inc. 
and its affiliates. All Rights Reserved. +import math + +import numpy as np +import torch +from torch import nn + +from maskrcnn_benchmark.structures.bounding_box import BoxList + + +class BufferList(nn.Module): + """ + Similar to nn.ParameterList, but for buffers + """ + + def __init__(self, buffers=None): + super(BufferList, self).__init__() + if buffers is not None: + self.extend(buffers) + + def extend(self, buffers): + offset = len(self) + for i, buffer in enumerate(buffers): + self.register_buffer(str(offset + i), buffer) + return self + + def __len__(self): + return len(self._buffers) + + def __iter__(self): + return iter(self._buffers.values()) + + +class AnchorGenerator(nn.Module): + """ + For a set of image sizes and feature maps, computes a set + of anchors + """ + + def __init__( + self, + sizes=(128, 256, 512), + aspect_ratios=(0.5, 1.0, 2.0), + anchor_strides=(8, 16, 32), + straddle_thresh=0, + ): + super(AnchorGenerator, self).__init__() + + if len(anchor_strides) == 1: + anchor_stride = anchor_strides[0] + cell_anchors = [ + generate_anchors(anchor_stride, sizes, aspect_ratios).float() + ] + else: + if len(anchor_strides) != len(sizes): + raise RuntimeError("FPN should have #anchor_strides == #sizes") + + cell_anchors = [ + generate_anchors( + anchor_stride, + size if isinstance(size, (tuple, list)) else (size,), + aspect_ratios + ).float() + for anchor_stride, size in zip(anchor_strides, sizes) + ] + self.strides = anchor_strides + self.cell_anchors = BufferList(cell_anchors) + self.straddle_thresh = straddle_thresh + + def num_anchors_per_location(self): + return [len(cell_anchors) for cell_anchors in self.cell_anchors] + + def grid_anchors(self, grid_sizes): + anchors = [] + for size, stride, base_anchors in zip( + grid_sizes, self.strides, self.cell_anchors + ): + grid_height, grid_width = size + device = base_anchors.device + shifts_x = torch.arange( + 0, grid_width * stride, step=stride, dtype=torch.float32, device=device + ) + shifts_y = torch.arange( + 0, grid_height * stride, step=stride, dtype=torch.float32, device=device + ) + shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x) + shift_x = shift_x.reshape(-1) + shift_y = shift_y.reshape(-1) + shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1) + + anchors.append( + (shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4) + ) + + return anchors + + def add_visibility_to(self, boxlist): + image_width, image_height = boxlist.size + anchors = boxlist.bbox + if self.straddle_thresh >= 0: + inds_inside = ( + (anchors[..., 0] >= -self.straddle_thresh) + & (anchors[..., 1] >= -self.straddle_thresh) + & (anchors[..., 2] < image_width + self.straddle_thresh) + & (anchors[..., 3] < image_height + self.straddle_thresh) + ) + else: + device = anchors.device + inds_inside = torch.ones(anchors.shape[0], dtype=torch.uint8, device=device) + boxlist.add_field("visibility", inds_inside) + + def forward(self, image_list, feature_maps): + grid_sizes = [feature_map.shape[-2:] for feature_map in feature_maps] + anchors_over_all_feature_maps = self.grid_anchors(grid_sizes) + anchors = [] + for i, (image_height, image_width) in enumerate(image_list.image_sizes): + anchors_in_image = [] + for anchors_per_feature_map in anchors_over_all_feature_maps: + boxlist = BoxList( + anchors_per_feature_map, (image_width, image_height), mode="xyxy" + ) + self.add_visibility_to(boxlist) + anchors_in_image.append(boxlist) + anchors.append(anchors_in_image) + return anchors + + +def make_anchor_generator(config): + 
anchor_sizes = config.MODEL.RPN.ANCHOR_SIZES + aspect_ratios = config.MODEL.RPN.ASPECT_RATIOS + anchor_stride = config.MODEL.RPN.ANCHOR_STRIDE + straddle_thresh = config.MODEL.RPN.STRADDLE_THRESH + + if config.MODEL.RPN.USE_FPN: + assert len(anchor_stride) == len( + anchor_sizes + ), "FPN should have len(ANCHOR_STRIDE) == len(ANCHOR_SIZES)" + else: + assert len(anchor_stride) == 1, "Non-FPN should have a single ANCHOR_STRIDE" + anchor_generator = AnchorGenerator( + anchor_sizes, aspect_ratios, anchor_stride, straddle_thresh + ) + return anchor_generator + + +def make_anchor_generator_retinanet(config): + anchor_sizes = config.MODEL.RETINANET.ANCHOR_SIZES + aspect_ratios = config.MODEL.RETINANET.ASPECT_RATIOS + anchor_strides = config.MODEL.RETINANET.ANCHOR_STRIDES + straddle_thresh = config.MODEL.RETINANET.STRADDLE_THRESH + octave = config.MODEL.RETINANET.OCTAVE + scales_per_octave = config.MODEL.RETINANET.SCALES_PER_OCTAVE + + assert len(anchor_strides) == len(anchor_sizes), "Only support FPN now" + new_anchor_sizes = [] + for size in anchor_sizes: + per_layer_anchor_sizes = [] + for scale_per_octave in range(scales_per_octave): + octave_scale = octave ** (scale_per_octave / float(scales_per_octave)) + per_layer_anchor_sizes.append(octave_scale * size) + new_anchor_sizes.append(tuple(per_layer_anchor_sizes)) + + anchor_generator = AnchorGenerator( + tuple(new_anchor_sizes), aspect_ratios, anchor_strides, straddle_thresh + ) + return anchor_generator + +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## +# +# Based on: +# -------------------------------------------------------- +# Faster R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick and Sean Bell +# -------------------------------------------------------- + + +# Verify that we compute the same anchors as Shaoqing's matlab implementation: +# +# >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat +# >> anchors +# +# anchors = +# +# -83 -39 100 56 +# -175 -87 192 104 +# -359 -183 376 200 +# -55 -55 72 72 +# -119 -119 136 136 +# -247 -247 264 264 +# -35 -79 52 96 +# -79 -167 96 184 +# -167 -343 184 360 + +# array([[ -83., -39., 100., 56.], +# [-175., -87., 192., 104.], +# [-359., -183., 376., 200.], +# [ -55., -55., 72., 72.], +# [-119., -119., 136., 136.], +# [-247., -247., 264., 264.], +# [ -35., -79., 52., 96.], +# [ -79., -167., 96., 184.], +# [-167., -343., 184., 360.]]) + + +def generate_anchors( + stride=16, sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.5, 1, 2) +): + """Generates a matrix of anchor boxes in (x1, y1, x2, y2) format. Anchors + are centered on stride / 2, have (approximate) sqrt areas of the specified + sizes, and aspect ratios as given. 
+ """ + return _generate_anchors( + stride, + np.array(sizes, dtype=np.float) / stride, + np.array(aspect_ratios, dtype=np.float), + ) + + +def _generate_anchors(base_size, scales, aspect_ratios): + """Generate anchor (reference) windows by enumerating aspect ratios X + scales wrt a reference (0, 0, base_size - 1, base_size - 1) window. + """ + anchor = np.array([1, 1, base_size, base_size], dtype=np.float) - 1 + anchors = _ratio_enum(anchor, aspect_ratios) + anchors = np.vstack( + [_scale_enum(anchors[i, :], scales) for i in range(anchors.shape[0])] + ) + return torch.from_numpy(anchors) + + +def _whctrs(anchor): + """Return width, height, x center, and y center for an anchor (window).""" + w = anchor[2] - anchor[0] + 1 + h = anchor[3] - anchor[1] + 1 + x_ctr = anchor[0] + 0.5 * (w - 1) + y_ctr = anchor[1] + 0.5 * (h - 1) + return w, h, x_ctr, y_ctr + + +def _mkanchors(ws, hs, x_ctr, y_ctr): + """Given a vector of widths (ws) and heights (hs) around a center + (x_ctr, y_ctr), output a set of anchors (windows). + """ + ws = ws[:, np.newaxis] + hs = hs[:, np.newaxis] + anchors = np.hstack( + ( + x_ctr - 0.5 * (ws - 1), + y_ctr - 0.5 * (hs - 1), + x_ctr + 0.5 * (ws - 1), + y_ctr + 0.5 * (hs - 1), + ) + ) + return anchors + + +def _ratio_enum(anchor, ratios): + """Enumerate a set of anchors for each aspect ratio wrt an anchor.""" + w, h, x_ctr, y_ctr = _whctrs(anchor) + size = w * h + size_ratios = size / ratios + ws = np.round(np.sqrt(size_ratios)) + hs = np.round(ws * ratios) + anchors = _mkanchors(ws, hs, x_ctr, y_ctr) + return anchors + + +def _scale_enum(anchor, scales): + """Enumerate a set of anchors for each scale wrt an anchor.""" + w, h, x_ctr, y_ctr = _whctrs(anchor) + ws = w * scales + hs = h * scales + anchors = _mkanchors(ws, hs, x_ctr, y_ctr) + return anchors diff --git a/maskrcnn_benchmark/modeling/rpn/inference.py b/maskrcnn_benchmark/modeling/rpn/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..556082645b65e4cb7ce70cc362ce5f0cd77227e1 --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/inference.py @@ -0,0 +1,202 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+import torch + +from maskrcnn_benchmark.modeling.box_coder import BoxCoder +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_nms +from maskrcnn_benchmark.structures.boxlist_ops import remove_small_boxes + +from ..utils import cat +from .utils import permute_and_flatten + +class RPNPostProcessor(torch.nn.Module): + """ + Performs post-processing on the outputs of the RPN boxes, before feeding the + proposals to the heads + """ + + def __init__( + self, + pre_nms_top_n, + post_nms_top_n, + nms_thresh, + min_size, + box_coder=None, + fpn_post_nms_top_n=None, + ): + """ + Arguments: + pre_nms_top_n (int) + post_nms_top_n (int) + nms_thresh (float) + min_size (int) + box_coder (BoxCoder) + fpn_post_nms_top_n (int) + """ + super(RPNPostProcessor, self).__init__() + self.pre_nms_top_n = pre_nms_top_n + self.post_nms_top_n = post_nms_top_n + self.nms_thresh = nms_thresh + self.min_size = min_size + + if box_coder is None: + box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0)) + self.box_coder = box_coder + + if fpn_post_nms_top_n is None: + fpn_post_nms_top_n = post_nms_top_n + self.fpn_post_nms_top_n = fpn_post_nms_top_n + + def add_gt_proposals(self, proposals, targets): + """ + Arguments: + proposals: list[BoxList] + targets: list[BoxList] + """ + # Get the device we're operating on + device = proposals[0].bbox.device + + gt_boxes = [target.copy_with_fields([]) for target in targets] + + # later cat of bbox requires all fields to be present for all bbox + # so we need to add a dummy for objectness that's missing + for gt_box in gt_boxes: + gt_box.add_field("objectness", torch.ones(len(gt_box), device=device)) + + proposals = [ + cat_boxlist((proposal, gt_box)) + for proposal, gt_box in zip(proposals, gt_boxes) + ] + + return proposals + + def forward_for_single_feature_map(self, anchors, objectness, box_regression): + """ + Arguments: + anchors: list[BoxList] + objectness: tensor of size N, A, H, W + box_regression: tensor of size N, A * 4, H, W + """ + device = objectness.device + N, A, H, W = objectness.shape + + # put in the same format as anchors + objectness = permute_and_flatten(objectness, N, A, 1, H, W).view(N, -1) + objectness = objectness.sigmoid() + + box_regression = permute_and_flatten(box_regression, N, A, 4, H, W) + + num_anchors = A * H * W + + pre_nms_top_n = min(self.pre_nms_top_n, num_anchors) + objectness, topk_idx = objectness.topk(pre_nms_top_n, dim=1, sorted=True) + + batch_idx = torch.arange(N, device=device)[:, None] + box_regression = box_regression[batch_idx, topk_idx] + + image_shapes = [box.size for box in anchors] + concat_anchors = torch.cat([a.bbox for a in anchors], dim=0) + concat_anchors = concat_anchors.reshape(N, -1, 4)[batch_idx, topk_idx] + + proposals = self.box_coder.decode( + box_regression.view(-1, 4), concat_anchors.view(-1, 4) + ) + + proposals = proposals.view(N, -1, 4) + + result = [] + for proposal, score, im_shape in zip(proposals, objectness, image_shapes): + boxlist = BoxList(proposal, im_shape, mode="xyxy") + boxlist.add_field("objectness", score) + boxlist = boxlist.clip_to_image(remove_empty=False) + boxlist = remove_small_boxes(boxlist, self.min_size) + boxlist = boxlist_nms( + boxlist, + self.nms_thresh, + max_proposals=self.post_nms_top_n, + score_field="objectness", + ) + result.append(boxlist) + return result + + def forward(self, anchors, objectness, box_regression, targets=None): + """ + 
Arguments: + anchors: list[list[BoxList]] + objectness: list[tensor] + box_regression: list[tensor] + + Returns: + boxlists (list[BoxList]): the post-processed anchors, after + applying box decoding and NMS + """ + sampled_boxes = [] + num_levels = len(objectness) + anchors = list(zip(*anchors)) + for a, o, b in zip(anchors, objectness, box_regression): + sampled_boxes.append(self.forward_for_single_feature_map(a, o, b)) + + boxlists = list(zip(*sampled_boxes)) + boxlists = [cat_boxlist(boxlist) for boxlist in boxlists] + + if num_levels > 1: + boxlists = self.select_over_all_levels(boxlists) + + # append ground-truth bboxes to proposals + if self.training and targets is not None: + boxlists = self.add_gt_proposals(boxlists, targets) + + return boxlists + + def select_over_all_levels(self, boxlists): + num_images = len(boxlists) + # different behavior during training and during testing: + # during training, post_nms_top_n is over *all* the proposals combined, while + # during testing, it is over the proposals for each image + # TODO resolve this difference and make it consistent. It should be per image, + # and not per batch + if self.training: + objectness = torch.cat( + [boxlist.get_field("objectness") for boxlist in boxlists], dim=0 + ) + box_sizes = [len(boxlist) for boxlist in boxlists] + post_nms_top_n = min(self.fpn_post_nms_top_n, len(objectness)) + _, inds_sorted = torch.topk(objectness, post_nms_top_n, dim=0, sorted=True) + inds_mask = torch.zeros_like(objectness, dtype=torch.uint8) + inds_mask[inds_sorted] = 1 + inds_mask = inds_mask.split(box_sizes) + for i in range(num_images): + boxlists[i] = boxlists[i][inds_mask[i]] + else: + for i in range(num_images): + objectness = boxlists[i].get_field("objectness") + post_nms_top_n = min(self.fpn_post_nms_top_n, len(objectness)) + _, inds_sorted = torch.topk( + objectness, post_nms_top_n, dim=0, sorted=True + ) + boxlists[i] = boxlists[i][inds_sorted] + return boxlists + + +def make_rpn_postprocessor(config, rpn_box_coder, is_train): + fpn_post_nms_top_n = config.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN + if not is_train: + fpn_post_nms_top_n = config.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST + + pre_nms_top_n = config.MODEL.RPN.PRE_NMS_TOP_N_TRAIN + post_nms_top_n = config.MODEL.RPN.POST_NMS_TOP_N_TRAIN + if not is_train: + pre_nms_top_n = config.MODEL.RPN.PRE_NMS_TOP_N_TEST + post_nms_top_n = config.MODEL.RPN.POST_NMS_TOP_N_TEST + nms_thresh = config.MODEL.RPN.NMS_THRESH + min_size = config.MODEL.RPN.MIN_SIZE + box_selector = RPNPostProcessor( + pre_nms_top_n=pre_nms_top_n, + post_nms_top_n=post_nms_top_n, + nms_thresh=nms_thresh, + min_size=min_size, + box_coder=rpn_box_coder, + fpn_post_nms_top_n=fpn_post_nms_top_n, + ) + return box_selector diff --git a/maskrcnn_benchmark/modeling/rpn/loss.py b/maskrcnn_benchmark/modeling/rpn/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..840e3545338f71dbf6f9ce38e12478e1efeb0714 --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/loss.py @@ -0,0 +1,157 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
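+# Rough usage sketch (illustrative; mirrors how RPNModule wires this up in rpn.py): +# +#     loss_evaluator = make_rpn_loss_evaluator(cfg, BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))) +#     loss_objectness, loss_rpn_box_reg = loss_evaluator( +#         anchors, objectness, rpn_box_regression, targets) +# +# with `targets` being the ground-truth boxes as a list[BoxList].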
+""" +This file contains specific functions for computing losses on the RPN +file +""" + +import torch +from torch.nn import functional as F + +from .utils import concat_box_prediction_layers + +from ..balanced_positive_negative_sampler import BalancedPositiveNegativeSampler +from ..utils import cat + +from maskrcnn_benchmark.layers import smooth_l1_loss +from maskrcnn_benchmark.modeling.matcher import Matcher +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou +from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist + + +class RPNLossComputation(object): + """ + This class computes the RPN loss. + """ + + def __init__(self, proposal_matcher, fg_bg_sampler, box_coder, + generate_labels_func): + """ + Arguments: + proposal_matcher (Matcher) + fg_bg_sampler (BalancedPositiveNegativeSampler) + box_coder (BoxCoder) + """ + # self.target_preparator = target_preparator + self.proposal_matcher = proposal_matcher + self.fg_bg_sampler = fg_bg_sampler + self.box_coder = box_coder + self.copied_fields = [] + self.generate_labels_func = generate_labels_func + self.discard_cases = ['not_visibility', 'between_thresholds'] + + def match_targets_to_anchors(self, anchor, target, copied_fields=[]): + match_quality_matrix = boxlist_iou(target, anchor) + matched_idxs = self.proposal_matcher(match_quality_matrix) + # RPN doesn't need any fields from target + # for creating the labels, so clear them all + target = target.copy_with_fields(copied_fields) + # get the targets corresponding GT for each anchor + # NB: need to clamp the indices because we can have a single + # GT in the image, and matched_idxs can be -2, which goes + # out of bounds + matched_targets = target[matched_idxs.clamp(min=0)] + matched_targets.add_field("matched_idxs", matched_idxs) + return matched_targets + + def prepare_targets(self, anchors, targets): + labels = [] + regression_targets = [] + for anchors_per_image, targets_per_image in zip(anchors, targets): + matched_targets = self.match_targets_to_anchors( + anchors_per_image, targets_per_image, self.copied_fields + ) + + matched_idxs = matched_targets.get_field("matched_idxs") + labels_per_image = self.generate_labels_func(matched_targets) + labels_per_image = labels_per_image.to(dtype=torch.float32) + + # Background (negative examples) + bg_indices = matched_idxs == Matcher.BELOW_LOW_THRESHOLD + labels_per_image[bg_indices] = 0 + + # discard anchors that go out of the boundaries of the image + if "not_visibility" in self.discard_cases: + labels_per_image[~anchors_per_image.get_field("visibility")] = -1 + + # discard indices that are between thresholds + if "between_thresholds" in self.discard_cases: + inds_to_discard = matched_idxs == Matcher.BETWEEN_THRESHOLDS + labels_per_image[inds_to_discard] = -1 + + # compute regression targets + regression_targets_per_image = self.box_coder.encode( + matched_targets.bbox, anchors_per_image.bbox + ) + + labels.append(labels_per_image) + regression_targets.append(regression_targets_per_image) + + return labels, regression_targets + + + def __call__(self, anchors, objectness, box_regression, targets): + """ + Arguments: + anchors (list[BoxList]) + objectness (list[Tensor]) + box_regression (list[Tensor]) + targets (list[BoxList]) + + Returns: + objectness_loss (Tensor) + box_loss (Tensor + """ + anchors = [cat_boxlist(anchors_per_image) for anchors_per_image in anchors] + labels, regression_targets = self.prepare_targets(anchors, targets) + sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels) + sampled_pos_inds 
= torch.nonzero(torch.cat(sampled_pos_inds, dim=0)).squeeze(1) + sampled_neg_inds = torch.nonzero(torch.cat(sampled_neg_inds, dim=0)).squeeze(1) + + sampled_inds = torch.cat([sampled_pos_inds, sampled_neg_inds], dim=0) + + objectness, box_regression = \ + concat_box_prediction_layers(objectness, box_regression) + + objectness = objectness.squeeze() + + labels = torch.cat(labels, dim=0) + regression_targets = torch.cat(regression_targets, dim=0) + + box_loss = smooth_l1_loss( + box_regression[sampled_pos_inds], + regression_targets[sampled_pos_inds], + beta=1.0 / 9, + size_average=False, + ) / (sampled_inds.numel()) + + objectness_loss = F.binary_cross_entropy_with_logits( + objectness[sampled_inds], labels[sampled_inds] + ) + + return objectness_loss, box_loss + +# This function should be overwritten in RetinaNet +def generate_rpn_labels(matched_targets): + matched_idxs = matched_targets.get_field("matched_idxs") + labels_per_image = matched_idxs >= 0 + return labels_per_image + + +def make_rpn_loss_evaluator(cfg, box_coder): + matcher = Matcher( + cfg.MODEL.RPN.FG_IOU_THRESHOLD, + cfg.MODEL.RPN.BG_IOU_THRESHOLD, + allow_low_quality_matches=True, + ) + + fg_bg_sampler = BalancedPositiveNegativeSampler( + cfg.MODEL.RPN.BATCH_SIZE_PER_IMAGE, cfg.MODEL.RPN.POSITIVE_FRACTION + ) + + loss_evaluator = RPNLossComputation( + matcher, + fg_bg_sampler, + box_coder, + generate_rpn_labels + ) + return loss_evaluator diff --git a/maskrcnn_benchmark/modeling/rpn/retinanet/__init__.py b/maskrcnn_benchmark/modeling/rpn/retinanet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/maskrcnn_benchmark/modeling/rpn/retinanet/inference.py b/maskrcnn_benchmark/modeling/rpn/retinanet/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..09c17adfc2565871632cffecd51738a2cbd9acb2 --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/retinanet/inference.py @@ -0,0 +1,194 @@ +import torch + +from ..inference import RPNPostProcessor +from ..utils import permute_and_flatten + +from maskrcnn_benchmark.modeling.box_coder import BoxCoder +from maskrcnn_benchmark.modeling.utils import cat +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_nms +from maskrcnn_benchmark.structures.boxlist_ops import remove_small_boxes + + +class RetinaNetPostProcessor(RPNPostProcessor): + """ + Performs post-processing on the outputs of the RetinaNet boxes. + This is only used in the testing. 
+ """ + def __init__( + self, + pre_nms_thresh, + pre_nms_top_n, + nms_thresh, + fpn_post_nms_top_n, + min_size, + num_classes, + box_coder=None, + ): + """ + Arguments: + pre_nms_thresh (float) + pre_nms_top_n (int) + nms_thresh (float) + fpn_post_nms_top_n (int) + min_size (int) + num_classes (int) + box_coder (BoxCoder) + """ + super(RetinaNetPostProcessor, self).__init__( + pre_nms_thresh, 0, nms_thresh, min_size + ) + self.pre_nms_thresh = pre_nms_thresh + self.pre_nms_top_n = pre_nms_top_n + self.nms_thresh = nms_thresh + self.fpn_post_nms_top_n = fpn_post_nms_top_n + self.min_size = min_size + self.num_classes = num_classes + + if box_coder is None: + box_coder = BoxCoder(weights=(10., 10., 5., 5.)) + self.box_coder = box_coder + + def add_gt_proposals(self, proposals, targets): + """ + This function is not used in RetinaNet + """ + pass + + def forward_for_single_feature_map( + self, anchors, box_cls, box_regression): + """ + Arguments: + anchors: list[BoxList] + box_cls: tensor of size N, A * C, H, W + box_regression: tensor of size N, A * 4, H, W + """ + device = box_cls.device + N, _, H, W = box_cls.shape + A = box_regression.size(1) // 4 + C = box_cls.size(1) // A + + # put in the same format as anchors + box_cls = permute_and_flatten(box_cls, N, A, C, H, W) + box_cls = box_cls.sigmoid() + + box_regression = permute_and_flatten(box_regression, N, A, 4, H, W) + box_regression = box_regression.reshape(N, -1, 4) + + num_anchors = A * H * W + + candidate_inds = box_cls > self.pre_nms_thresh + + pre_nms_top_n = candidate_inds.view(N, -1).sum(1) + pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_top_n) + + results = [] + for per_box_cls, per_box_regression, per_pre_nms_top_n, \ + per_candidate_inds, per_anchors in zip( + box_cls, + box_regression, + pre_nms_top_n, + candidate_inds, + anchors): + + # Sort and select TopN + # TODO most of this can be made out of the loop for + # all images. + # TODO:Yang: Not easy to do. Because the numbers of detections are + # different in each image. Therefore, this part needs to be done + # per image. + per_box_cls = per_box_cls[per_candidate_inds] + + per_box_cls, top_k_indices = \ + per_box_cls.topk(per_pre_nms_top_n, sorted=False) + + per_candidate_nonzeros = \ + per_candidate_inds.nonzero()[top_k_indices, :] + + per_box_loc = per_candidate_nonzeros[:, 0] + per_class = per_candidate_nonzeros[:, 1] + per_class += 1 + + detections = self.box_coder.decode( + per_box_regression[per_box_loc, :].view(-1, 4), + per_anchors.bbox[per_box_loc, :].view(-1, 4) + ) + + boxlist = BoxList(detections, per_anchors.size, mode="xyxy") + boxlist.add_field("labels", per_class) + boxlist.add_field("scores", per_box_cls) + boxlist = boxlist.clip_to_image(remove_empty=False) + boxlist = remove_small_boxes(boxlist, self.min_size) + results.append(boxlist) + + return results + + # TODO very similar to filter_results from PostProcessor + # but filter_results is per image + # TODO Yang: solve this issue in the future. No good solution + # right now. 
+ def select_over_all_levels(self, boxlists): + num_images = len(boxlists) + results = [] + for i in range(num_images): + scores = boxlists[i].get_field("scores") + labels = boxlists[i].get_field("labels") + boxes = boxlists[i].bbox + boxlist = boxlists[i] + result = [] + # skip the background + for j in range(1, self.num_classes): + inds = (labels == j).nonzero().view(-1) + + scores_j = scores[inds] + boxes_j = boxes[inds, :].view(-1, 4) + boxlist_for_class = BoxList(boxes_j, boxlist.size, mode="xyxy") + boxlist_for_class.add_field("scores", scores_j) + boxlist_for_class = boxlist_nms( + boxlist_for_class, self.nms_thresh, + score_field="scores" + ) + num_labels = len(boxlist_for_class) + boxlist_for_class.add_field( + "labels", torch.full((num_labels,), j, + dtype=torch.int64, + device=scores.device) + ) + result.append(boxlist_for_class) + + result = cat_boxlist(result) + number_of_detections = len(result) + + # Limit to max_per_image detections **over all classes** + if number_of_detections > self.fpn_post_nms_top_n > 0: + cls_scores = result.get_field("scores") + image_thresh, _ = torch.kthvalue( + cls_scores.cpu(), + number_of_detections - self.fpn_post_nms_top_n + 1 + ) + keep = cls_scores >= image_thresh.item() + keep = torch.nonzero(keep).squeeze(1) + result = result[keep] + results.append(result) + return results + + +def make_retinanet_postprocessor(config, rpn_box_coder, is_train): + pre_nms_thresh = config.MODEL.RETINANET.INFERENCE_TH + pre_nms_top_n = config.MODEL.RETINANET.PRE_NMS_TOP_N + nms_thresh = config.MODEL.RETINANET.NMS_TH + fpn_post_nms_top_n = config.TEST.DETECTIONS_PER_IMG + min_size = 0 + + box_selector = RetinaNetPostProcessor( + pre_nms_thresh=pre_nms_thresh, + pre_nms_top_n=pre_nms_top_n, + nms_thresh=nms_thresh, + fpn_post_nms_top_n=fpn_post_nms_top_n, + min_size=min_size, + num_classes=config.MODEL.RETINANET.NUM_CLASSES, + box_coder=rpn_box_coder, + ) + + return box_selector diff --git a/maskrcnn_benchmark/modeling/rpn/retinanet/loss.py b/maskrcnn_benchmark/modeling/rpn/retinanet/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..080e2153ba59e90e620f30a5adc5426a1551e4e8 --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/retinanet/loss.py @@ -0,0 +1,107 @@ +""" +This file contains specific functions for computing losses on the RetinaNet +file +""" + +import torch +from torch.nn import functional as F + +from ..utils import concat_box_prediction_layers + +from maskrcnn_benchmark.layers import smooth_l1_loss +from maskrcnn_benchmark.layers import SigmoidFocalLoss +from maskrcnn_benchmark.modeling.matcher import Matcher +from maskrcnn_benchmark.modeling.utils import cat +from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou +from maskrcnn_benchmark.structures.boxlist_ops import cat_boxlist +from maskrcnn_benchmark.modeling.rpn.loss import RPNLossComputation + +class RetinaNetLossComputation(RPNLossComputation): + """ + This class computes the RetinaNet loss. 
+ """ + + def __init__(self, proposal_matcher, box_coder, + generate_labels_func, + sigmoid_focal_loss, + bbox_reg_beta=0.11, + regress_norm=1.0): + """ + Arguments: + proposal_matcher (Matcher) + box_coder (BoxCoder) + """ + self.proposal_matcher = proposal_matcher + self.box_coder = box_coder + self.box_cls_loss_func = sigmoid_focal_loss + self.bbox_reg_beta = bbox_reg_beta + self.copied_fields = ['labels'] + self.generate_labels_func = generate_labels_func + self.discard_cases = ['between_thresholds'] + self.regress_norm = regress_norm + + def __call__(self, anchors, box_cls, box_regression, targets): + """ + Arguments: + anchors (list[BoxList]) + box_cls (list[Tensor]) + box_regression (list[Tensor]) + targets (list[BoxList]) + + Returns: + retinanet_cls_loss (Tensor) + retinanet_regression_loss (Tensor + """ + anchors = [cat_boxlist(anchors_per_image) for anchors_per_image in anchors] + labels, regression_targets = self.prepare_targets(anchors, targets) + + N = len(labels) + box_cls, box_regression = \ + concat_box_prediction_layers(box_cls, box_regression) + + labels = torch.cat(labels, dim=0) + regression_targets = torch.cat(regression_targets, dim=0) + pos_inds = torch.nonzero(labels > 0).squeeze(1) + + retinanet_regression_loss = smooth_l1_loss( + box_regression[pos_inds], + regression_targets[pos_inds], + beta=self.bbox_reg_beta, + size_average=False, + ) / (max(1, pos_inds.numel() * self.regress_norm)) + + labels = labels.int() + + retinanet_cls_loss = self.box_cls_loss_func( + box_cls, + labels + ) / (pos_inds.numel() + N) + + return retinanet_cls_loss, retinanet_regression_loss + + +def generate_retinanet_labels(matched_targets): + labels_per_image = matched_targets.get_field("labels") + return labels_per_image + + +def make_retinanet_loss_evaluator(cfg, box_coder): + matcher = Matcher( + cfg.MODEL.RETINANET.FG_IOU_THRESHOLD, + cfg.MODEL.RETINANET.BG_IOU_THRESHOLD, + allow_low_quality_matches=True, + ) + sigmoid_focal_loss = SigmoidFocalLoss( + cfg.MODEL.RETINANET.LOSS_GAMMA, + cfg.MODEL.RETINANET.LOSS_ALPHA + ) + + loss_evaluator = RetinaNetLossComputation( + matcher, + box_coder, + generate_retinanet_labels, + sigmoid_focal_loss, + bbox_reg_beta = cfg.MODEL.RETINANET.BBOX_REG_BETA, + regress_norm = cfg.MODEL.RETINANET.BBOX_REG_WEIGHT, + ) + return loss_evaluator diff --git a/maskrcnn_benchmark/modeling/rpn/retinanet/retinanet.py b/maskrcnn_benchmark/modeling/rpn/retinanet/retinanet.py new file mode 100644 index 0000000000000000000000000000000000000000..1599b29b2e9bbb626b31d652022fbbd034bf5e30 --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/retinanet/retinanet.py @@ -0,0 +1,152 @@ +import math +import torch +import torch.nn.functional as F +from torch import nn + +from .inference import make_retinanet_postprocessor +from .loss import make_retinanet_loss_evaluator +from ..anchor_generator import make_anchor_generator_retinanet + +from maskrcnn_benchmark.modeling.box_coder import BoxCoder + + +class RetinaNetHead(torch.nn.Module): + """ + Adds a RetinNet head with classification and regression heads + """ + + def __init__(self, cfg, in_channels): + """ + Arguments: + in_channels (int): number of channels of the input feature + num_anchors (int): number of anchors to be predicted + """ + super(RetinaNetHead, self).__init__() + # TODO: Implement the sigmoid version first. 
+ num_classes = cfg.MODEL.RETINANET.NUM_CLASSES - 1 + num_anchors = len(cfg.MODEL.RETINANET.ASPECT_RATIOS) \ + * cfg.MODEL.RETINANET.SCALES_PER_OCTAVE + + cls_tower = [] + bbox_tower = [] + for i in range(cfg.MODEL.RETINANET.NUM_CONVS): + cls_tower.append( + nn.Conv2d( + in_channels, + in_channels, + kernel_size=3, + stride=1, + padding=1 + ) + ) + cls_tower.append(nn.ReLU()) + bbox_tower.append( + nn.Conv2d( + in_channels, + in_channels, + kernel_size=3, + stride=1, + padding=1 + ) + ) + bbox_tower.append(nn.ReLU()) + + self.add_module('cls_tower', nn.Sequential(*cls_tower)) + self.add_module('bbox_tower', nn.Sequential(*bbox_tower)) + self.cls_logits = nn.Conv2d( + in_channels, num_anchors * num_classes, kernel_size=3, stride=1, + padding=1 + ) + self.bbox_pred = nn.Conv2d( + in_channels, num_anchors * 4, kernel_size=3, stride=1, + padding=1 + ) + + # Initialization + for modules in [self.cls_tower, self.bbox_tower, self.cls_logits, + self.bbox_pred]: + for l in modules.modules(): + if isinstance(l, nn.Conv2d): + torch.nn.init.normal_(l.weight, std=0.01) + torch.nn.init.constant_(l.bias, 0) + + + # retinanet_bias_init + prior_prob = cfg.MODEL.RETINANET.PRIOR_PROB + bias_value = -math.log((1 - prior_prob) / prior_prob) + torch.nn.init.constant_(self.cls_logits.bias, bias_value) + + def forward(self, x): + logits = [] + bbox_reg = [] + for feature in x: + logits.append(self.cls_logits(self.cls_tower(feature))) + bbox_reg.append(self.bbox_pred(self.bbox_tower(feature))) + return logits, bbox_reg + + +class RetinaNetModule(torch.nn.Module): + """ + Module for RetinaNet computation. Takes feature maps from the backbone and + computes RetinaNet outputs and losses. Only tested with FPN for now. + """ + + def __init__(self, cfg, in_channels): + super(RetinaNetModule, self).__init__() + + self.cfg = cfg.clone() + + anchor_generator = make_anchor_generator_retinanet(cfg) + head = RetinaNetHead(cfg, in_channels) + box_coder = BoxCoder(weights=(10., 10., 5., 5.)) + + box_selector_test = make_retinanet_postprocessor(cfg, box_coder, is_train=False) + + loss_evaluator = make_retinanet_loss_evaluator(cfg, box_coder) + + self.anchor_generator = anchor_generator + self.head = head + self.box_selector_test = box_selector_test + self.loss_evaluator = loss_evaluator + + def forward(self, images, features, targets=None): + """ + Arguments: + images (ImageList): images for which we want to compute the predictions + features (list[Tensor]): features computed from the images that are + used for computing the predictions. Each tensor in the list + corresponds to a different feature level + targets (list[BoxList]): ground-truth boxes present in the image (optional) + + Returns: + boxes (list[BoxList]): the predicted boxes from the RPN, one BoxList per + image. + losses (dict[Tensor]): the losses for the model during training. During + testing, it is an empty dict. 
+ """ + box_cls, box_regression = self.head(features) + anchors = self.anchor_generator(images, features) + + if self.training: + return self._forward_train(anchors, box_cls, box_regression, targets) + else: + return self._forward_test(anchors, box_cls, box_regression) + + def _forward_train(self, anchors, box_cls, box_regression, targets): + + loss_box_cls, loss_box_reg = self.loss_evaluator( + anchors, box_cls, box_regression, targets + ) + losses = { + "loss_retina_cls": loss_box_cls, + "loss_retina_reg": loss_box_reg, + } + return anchors, losses + + def _forward_test(self, anchors, box_cls, box_regression): + boxes = self.box_selector_test(anchors, box_cls, box_regression) + return boxes, {} + + +def build_retinanet(cfg, in_channels): + return RetinaNetModule(cfg, in_channels) diff --git a/maskrcnn_benchmark/modeling/rpn/rpn.py b/maskrcnn_benchmark/modeling/rpn/rpn.py new file mode 100644 index 0000000000000000000000000000000000000000..07997651cb3d0e2f9e90f26e7ec47c78e04c2073 --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/rpn.py @@ -0,0 +1,207 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch +import torch.nn.functional as F +from torch import nn + +from maskrcnn_benchmark.modeling import registry +from maskrcnn_benchmark.modeling.box_coder import BoxCoder +from maskrcnn_benchmark.modeling.rpn.retinanet.retinanet import build_retinanet +from .loss import make_rpn_loss_evaluator +from .anchor_generator import make_anchor_generator +from .inference import make_rpn_postprocessor + + +class RPNHeadConvRegressor(nn.Module): + """ + A simple RPN Head for classification and bbox regression + """ + + def __init__(self, cfg, in_channels, num_anchors): + """ + Arguments: + cfg : config + in_channels (int): number of channels of the input feature + num_anchors (int): number of anchors to be predicted + """ + super(RPNHeadConvRegressor, self).__init__() + self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1) + self.bbox_pred = nn.Conv2d( + in_channels, num_anchors * 4, kernel_size=1, stride=1 + ) + + for l in [self.cls_logits, self.bbox_pred]: + torch.nn.init.normal_(l.weight, std=0.01) + torch.nn.init.constant_(l.bias, 0) + + def forward(self, x): + assert isinstance(x, (list, tuple)) + logits = [self.cls_logits(y) for y in x] + bbox_reg = [self.bbox_pred(y) for y in x] + + return logits, bbox_reg + + +class RPNHeadFeatureSingleConv(nn.Module): + """ + Adds a simple RPN Head with one conv to extract the feature + """ + + def __init__(self, cfg, in_channels): + """ + Arguments: + cfg : config + in_channels (int): number of channels of the input feature + """ + super(RPNHeadFeatureSingleConv, self).__init__() + self.conv = nn.Conv2d( + in_channels, in_channels, kernel_size=3, stride=1, padding=1 + ) + + for l in [self.conv]: + torch.nn.init.normal_(l.weight, std=0.01) + torch.nn.init.constant_(l.bias, 0) + + self.out_channels = in_channels + + def forward(self, x): + assert isinstance(x, (list, tuple)) + x = [F.relu(self.conv(z)) for z in x] + + return x + + +@registry.RPN_HEADS.register("SingleConvRPNHead") +class RPNHead(nn.Module): + """ + Adds a simple RPN Head with classification and regression heads + """ + + def __init__(self, cfg, in_channels, num_anchors): + """ + Arguments: + cfg : config + in_channels (int): number of channels of the input feature + num_anchors (int): number of anchors to be predicted + """ + super(RPNHead, self).__init__() + self.conv = nn.Conv2d( + in_channels, in_channels, kernel_size=3, stride=1, 
padding=1 + ) + self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1) + self.bbox_pred = nn.Conv2d( + in_channels, num_anchors * 4, kernel_size=1, stride=1 + ) + + for l in [self.conv, self.cls_logits, self.bbox_pred]: + torch.nn.init.normal_(l.weight, std=0.01) + torch.nn.init.constant_(l.bias, 0) + + def forward(self, x): + logits = [] + bbox_reg = [] + for feature in x: + t = F.relu(self.conv(feature)) + logits.append(self.cls_logits(t)) + bbox_reg.append(self.bbox_pred(t)) + return logits, bbox_reg + + +class RPNModule(torch.nn.Module): + """ + Module for RPN computation. Takes feature maps from the backbone and RPN + proposals and losses. Works for both FPN and non-FPN. + """ + + def __init__(self, cfg, in_channels): + super(RPNModule, self).__init__() + + self.cfg = cfg.clone() + + anchor_generator = make_anchor_generator(cfg) + + rpn_head = registry.RPN_HEADS[cfg.MODEL.RPN.RPN_HEAD] + head = rpn_head( + cfg, in_channels, anchor_generator.num_anchors_per_location()[0] + ) + + rpn_box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0)) + + box_selector_train = make_rpn_postprocessor(cfg, rpn_box_coder, is_train=True) + box_selector_test = make_rpn_postprocessor(cfg, rpn_box_coder, is_train=False) + + loss_evaluator = make_rpn_loss_evaluator(cfg, rpn_box_coder) + + self.anchor_generator = anchor_generator + self.head = head + self.box_selector_train = box_selector_train + self.box_selector_test = box_selector_test + self.loss_evaluator = loss_evaluator + + def forward(self, images, features, targets=None): + """ + Arguments: + images (ImageList): images for which we want to compute the predictions + features (list[Tensor]): features computed from the images that are + used for computing the predictions. Each tensor in the list + correspond to different feature levels + targets (list[BoxList): ground-truth boxes present in the image (optional) + + Returns: + boxes (list[BoxList]): the predicted boxes from the RPN, one BoxList per + image. + losses (dict[Tensor]): the losses for the model during training. During + testing, it is an empty dict. + """ + objectness, rpn_box_regression = self.head(features) + anchors = self.anchor_generator(images, features) + + if self.training: + return self._forward_train(anchors, objectness, rpn_box_regression, targets) + else: + return self._forward_test(anchors, objectness, rpn_box_regression) + + def _forward_train(self, anchors, objectness, rpn_box_regression, targets): + if self.cfg.MODEL.RPN_ONLY: + # When training an RPN-only model, the loss is determined by the + # predicted objectness and rpn_box_regression values and there is + # no need to transform the anchors into predicted boxes; this is an + # optimization that avoids the unnecessary transformation. + boxes = anchors + else: + # For end-to-end models, anchors must be transformed into boxes and + # sampled into a training batch. + with torch.no_grad(): + boxes = self.box_selector_train( + anchors, objectness, rpn_box_regression, targets + ) + loss_objectness, loss_rpn_box_reg = self.loss_evaluator( + anchors, objectness, rpn_box_regression, targets + ) + losses = { + "loss_objectness": loss_objectness, + "loss_rpn_box_reg": loss_rpn_box_reg, + } + return boxes, losses + + def _forward_test(self, anchors, objectness, rpn_box_regression): + boxes = self.box_selector_test(anchors, objectness, rpn_box_regression) + if self.cfg.MODEL.RPN_ONLY: + # For end-to-end models, the RPN proposals are an intermediate state + # and don't bother to sort them in decreasing score order. 
For RPN-only + # models, the proposals are the final output and we return them in + # high-to-low confidence order. + inds = [ + box.get_field("objectness").sort(descending=True)[1] for box in boxes + ] + boxes = [box[ind] for box, ind in zip(boxes, inds)] + return boxes, {} + + +def build_rpn(cfg, in_channels): + """ + This gives the gist of it. Not super important because it doesn't change as much + """ + if cfg.MODEL.RETINANET_ON: + return build_retinanet(cfg, in_channels) + + return RPNModule(cfg, in_channels) diff --git a/maskrcnn_benchmark/modeling/rpn/utils.py b/maskrcnn_benchmark/modeling/rpn/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..37a9ca6f228ac4315746a234533edbdd9fa73569 --- /dev/null +++ b/maskrcnn_benchmark/modeling/rpn/utils.py @@ -0,0 +1,45 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +""" +Utility functions minipulating the prediction layers +""" + +from ..utils import cat + +import torch + +def permute_and_flatten(layer, N, A, C, H, W): + layer = layer.view(N, -1, C, H, W) + layer = layer.permute(0, 3, 4, 1, 2) + layer = layer.reshape(N, -1, C) + return layer + + +def concat_box_prediction_layers(box_cls, box_regression): + box_cls_flattened = [] + box_regression_flattened = [] + # for each feature level, permute the outputs to make them be in the + # same format as the labels. Note that the labels are computed for + # all feature levels concatenated, so we keep the same representation + # for the objectness and the box_regression + for box_cls_per_level, box_regression_per_level in zip( + box_cls, box_regression + ): + N, AxC, H, W = box_cls_per_level.shape + Ax4 = box_regression_per_level.shape[1] + A = Ax4 // 4 + C = AxC // A + box_cls_per_level = permute_and_flatten( + box_cls_per_level, N, A, C, H, W + ) + box_cls_flattened.append(box_cls_per_level) + + box_regression_per_level = permute_and_flatten( + box_regression_per_level, N, A, 4, H, W + ) + box_regression_flattened.append(box_regression_per_level) + # concatenate on the first dimension (representing the feature levels), to + # take into account the way the labels were generated (with all feature maps + # being concatenated as well) + box_cls = cat(box_cls_flattened, dim=1).reshape(-1, C) + box_regression = cat(box_regression_flattened, dim=1).reshape(-1, 4) + return box_cls, box_regression diff --git a/maskrcnn_benchmark/modeling/utils.py b/maskrcnn_benchmark/modeling/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5b1d79a812ab3db034cf817583281c006b11b90a --- /dev/null +++ b/maskrcnn_benchmark/modeling/utils.py @@ -0,0 +1,16 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +""" +Miscellaneous utility functions +""" + +import torch + + +def cat(tensors, dim=0): + """ + Efficient version of torch.cat that avoids a copy if there is only a single element in a list + """ + assert isinstance(tensors, (list, tuple)) + if len(tensors) == 1: + return tensors[0] + return torch.cat(tensors, dim) diff --git a/maskrcnn_benchmark/solver/__init__.py b/maskrcnn_benchmark/solver/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..75f40530cccb6b989d33193de92a6c26a07cf751 --- /dev/null +++ b/maskrcnn_benchmark/solver/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
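The shape bookkeeping in `permute_and_flatten` and `concat_box_prediction_layers` (defined in `maskrcnn_benchmark/modeling/rpn/utils.py` above) is easiest to follow with concrete numbers. The snippet below is an illustrative sanity check only, not part of the patch; it assumes the package from this diff is installed and importable.

```python
import torch

from maskrcnn_benchmark.modeling.rpn.utils import (
    concat_box_prediction_layers,
    permute_and_flatten,
)

# Two feature levels, 3 anchors per location, 1 "class" (RPN objectness).
N, A, C = 2, 3, 1
box_cls = [torch.rand(N, A * C, 8, 8), torch.rand(N, A * C, 4, 4)]
box_reg = [torch.rand(N, A * 4, 8, 8), torch.rand(N, A * 4, 4, 4)]

# (N, A*C, H, W) -> (N, H*W*A, C): the predictions become one row per anchor.
flat = permute_and_flatten(box_cls[0], N, A, C, 8, 8)
assert flat.shape == (N, 8 * 8 * A, C)

# All levels are concatenated the same way the labels are generated,
# giving one row per anchor across the whole batch and all levels.
cls_flat, reg_flat = concat_box_prediction_layers(box_cls, box_reg)
assert cls_flat.shape == (N * (8 * 8 + 4 * 4) * A, C)
assert reg_flat.shape == (N * (8 * 8 + 4 * 4) * A, 4)
```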
+from .build import make_optimizer +from .build import make_lr_scheduler +from .lr_scheduler import WarmupMultiStepLR diff --git a/maskrcnn_benchmark/solver/build.py b/maskrcnn_benchmark/solver/build.py new file mode 100644 index 0000000000000000000000000000000000000000..e6f07b5189464775ab4150e22dce5885006e37db --- /dev/null +++ b/maskrcnn_benchmark/solver/build.py @@ -0,0 +1,34 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch + +from .lr_scheduler import WarmupMultiStepLR + + +def make_optimizer(cfg, model): + params = [] + for key, value in model.named_parameters(): + if not value.requires_grad: + continue + lr = cfg.SOLVER.BASE_LR + weight_decay = cfg.SOLVER.WEIGHT_DECAY + if 'heatmap_head' in key: + lr = lr * 1e-5 * 1 + weight_decay = 5e-4 + elif "bias" in key: + lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR + weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS + params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}] + + optimizer = torch.optim.SGD(params, lr, momentum=cfg.SOLVER.MOMENTUM) + return optimizer + + +def make_lr_scheduler(cfg, optimizer): + return WarmupMultiStepLR( + optimizer, + cfg.SOLVER.STEPS, + cfg.SOLVER.GAMMA, + warmup_factor=cfg.SOLVER.WARMUP_FACTOR, + warmup_iters=cfg.SOLVER.WARMUP_ITERS, + warmup_method=cfg.SOLVER.WARMUP_METHOD, + ) diff --git a/maskrcnn_benchmark/solver/lr_scheduler.py b/maskrcnn_benchmark/solver/lr_scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..d7d45b6c6f98e66a5da5b8b84a50258a517bb1e4 --- /dev/null +++ b/maskrcnn_benchmark/solver/lr_scheduler.py @@ -0,0 +1,52 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from bisect import bisect_right + +import torch + + +# FIXME ideally this would be achieved with a CombinedLRScheduler, +# separating MultiStepLR with WarmupLR +# but the current LRScheduler design doesn't allow it +class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler): + def __init__( + self, + optimizer, + milestones, + gamma=0.1, + warmup_factor=1.0 / 3, + warmup_iters=500, + warmup_method="linear", + last_epoch=-1, + ): + if not list(milestones) == sorted(milestones): + raise ValueError( + "Milestones should be a list of" " increasing integers. 
Got {}", + milestones, + ) + + if warmup_method not in ("constant", "linear"): + raise ValueError( + "Only 'constant' or 'linear' warmup_method accepted" + "got {}".format(warmup_method) + ) + self.milestones = milestones + self.gamma = gamma + self.warmup_factor = warmup_factor + self.warmup_iters = warmup_iters + self.warmup_method = warmup_method + super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch) + + def get_lr(self): + warmup_factor = 1 + if self.last_epoch < self.warmup_iters: + if self.warmup_method == "constant": + warmup_factor = self.warmup_factor + elif self.warmup_method == "linear": + alpha = float(self.last_epoch) / self.warmup_iters + warmup_factor = self.warmup_factor * (1 - alpha) + alpha + return [ + base_lr + * warmup_factor + * self.gamma ** bisect_right(self.milestones, self.last_epoch) + for base_lr in self.base_lrs + ] diff --git a/maskrcnn_benchmark/structures/__init__.py b/maskrcnn_benchmark/structures/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/maskrcnn_benchmark/structures/bounding_box.py b/maskrcnn_benchmark/structures/bounding_box.py new file mode 100644 index 0000000000000000000000000000000000000000..4084024fa55aadc4161b7ee4ae8644eca62b26e1 --- /dev/null +++ b/maskrcnn_benchmark/structures/bounding_box.py @@ -0,0 +1,266 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch + +# transpose +FLIP_LEFT_RIGHT = 0 +FLIP_TOP_BOTTOM = 1 + + +class BoxList(object): + """ + This class represents a set of bounding boxes. + The bounding boxes are represented as a Nx4 Tensor. + In order to uniquely determine the bounding boxes with respect + to an image, we also store the corresponding image dimensions. + They can contain extra information that is specific to each bounding box, such as + labels. 
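+    Extra fields added with add_field are indexed together with the boxes
+    (see __getitem__), so per-box data should have one entry per box.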
+ """ + + def __init__(self, bbox, image_size, mode="xyxy"): + device = bbox.device if isinstance(bbox, torch.Tensor) else torch.device("cpu") + bbox = torch.as_tensor(bbox, dtype=torch.float32, device=device) + if bbox.ndimension() != 2: + raise ValueError( + "bbox should have 2 dimensions, got {}".format(bbox.ndimension()) + ) + if bbox.size(-1) != 4: + raise ValueError( + "last dimension of bbox should have a " + "size of 4, got {}".format(bbox.size(-1)) + ) + if mode not in ("xyxy", "xywh"): + raise ValueError("mode should be 'xyxy' or 'xywh'") + + self.bbox = bbox + self.size = image_size # (image_width, image_height) + self.mode = mode + self.extra_fields = {} + + def add_field(self, field, field_data): + self.extra_fields[field] = field_data + + def get_field(self, field): + return self.extra_fields[field] + + def has_field(self, field): + return field in self.extra_fields + + def fields(self): + return list(self.extra_fields.keys()) + + def _copy_extra_fields(self, bbox): + for k, v in bbox.extra_fields.items(): + self.extra_fields[k] = v + + def convert(self, mode): + if mode not in ("xyxy", "xywh"): + raise ValueError("mode should be 'xyxy' or 'xywh'") + if mode == self.mode: + return self + # we only have two modes, so don't need to check + # self.mode + xmin, ymin, xmax, ymax = self._split_into_xyxy() + if mode == "xyxy": + bbox = torch.cat((xmin, ymin, xmax, ymax), dim=-1) + bbox = BoxList(bbox, self.size, mode=mode) + else: + TO_REMOVE = 1 + bbox = torch.cat( + (xmin, ymin, xmax - xmin + TO_REMOVE, ymax - ymin + TO_REMOVE), dim=-1 + ) + bbox = BoxList(bbox, self.size, mode=mode) + bbox._copy_extra_fields(self) + return bbox + + def _split_into_xyxy(self): + if self.mode == "xyxy": + xmin, ymin, xmax, ymax = self.bbox.split(1, dim=-1) + return xmin, ymin, xmax, ymax + elif self.mode == "xywh": + TO_REMOVE = 1 + xmin, ymin, w, h = self.bbox.split(1, dim=-1) + return ( + xmin, + ymin, + xmin + (w - TO_REMOVE).clamp(min=0), + ymin + (h - TO_REMOVE).clamp(min=0), + ) + else: + raise RuntimeError("Should not be here") + + def resize(self, size, *args, **kwargs): + """ + Returns a resized copy of this bounding box + + :param size: The requested size in pixels, as a 2-tuple: + (width, height). 
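+        Fields that are not plain tensors (e.g. masks or keypoints) are
+        resized recursively; tensor fields are carried over unchanged.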
+ """ + + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(size, self.size)) + if ratios[0] == ratios[1]: + ratio = ratios[0] + scaled_box = self.bbox * ratio + bbox = BoxList(scaled_box, size, mode=self.mode) + # bbox._copy_extra_fields(self) + for k, v in self.extra_fields.items(): + if not isinstance(v, torch.Tensor): + v = v.resize(size, *args, **kwargs) + bbox.add_field(k, v) + return bbox + + ratio_width, ratio_height = ratios + xmin, ymin, xmax, ymax = self._split_into_xyxy() + scaled_xmin = xmin * ratio_width + scaled_xmax = xmax * ratio_width + scaled_ymin = ymin * ratio_height + scaled_ymax = ymax * ratio_height + scaled_box = torch.cat( + (scaled_xmin, scaled_ymin, scaled_xmax, scaled_ymax), dim=-1 + ) + bbox = BoxList(scaled_box, size, mode="xyxy") + # bbox._copy_extra_fields(self) + for k, v in self.extra_fields.items(): + if not isinstance(v, torch.Tensor): + v = v.resize(size, *args, **kwargs) + bbox.add_field(k, v) + + return bbox.convert(self.mode) + + def transpose(self, method): + """ + Transpose bounding box (flip or rotate in 90 degree steps) + :param method: One of :py:attr:`PIL.Image.FLIP_LEFT_RIGHT`, + :py:attr:`PIL.Image.FLIP_TOP_BOTTOM`, :py:attr:`PIL.Image.ROTATE_90`, + :py:attr:`PIL.Image.ROTATE_180`, :py:attr:`PIL.Image.ROTATE_270`, + :py:attr:`PIL.Image.TRANSPOSE` or :py:attr:`PIL.Image.TRANSVERSE`. + """ + if method not in (FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM): + raise NotImplementedError( + "Only FLIP_LEFT_RIGHT and FLIP_TOP_BOTTOM implemented" + ) + + image_width, image_height = self.size + xmin, ymin, xmax, ymax = self._split_into_xyxy() + if method == FLIP_LEFT_RIGHT: + TO_REMOVE = 1 + transposed_xmin = image_width - xmax - TO_REMOVE + transposed_xmax = image_width - xmin - TO_REMOVE + transposed_ymin = ymin + transposed_ymax = ymax + elif method == FLIP_TOP_BOTTOM: + transposed_xmin = xmin + transposed_xmax = xmax + transposed_ymin = image_height - ymax + transposed_ymax = image_height - ymin + + transposed_boxes = torch.cat( + (transposed_xmin, transposed_ymin, transposed_xmax, transposed_ymax), dim=-1 + ) + bbox = BoxList(transposed_boxes, self.size, mode="xyxy") + # bbox._copy_extra_fields(self) + for k, v in self.extra_fields.items(): + if not isinstance(v, torch.Tensor): + v = v.transpose(method) + bbox.add_field(k, v) + return bbox.convert(self.mode) + + def crop(self, box): + """ + Cropss a rectangular region from this bounding box. The box is a + 4-tuple defining the left, upper, right, and lower pixel + coordinate. + """ + xmin, ymin, xmax, ymax = self._split_into_xyxy() + w, h = box[2] - box[0], box[3] - box[1] + cropped_xmin = (xmin - box[0]).clamp(min=0, max=w) + cropped_ymin = (ymin - box[1]).clamp(min=0, max=h) + cropped_xmax = (xmax - box[0]).clamp(min=0, max=w) + cropped_ymax = (ymax - box[1]).clamp(min=0, max=h) + + # TODO should I filter empty boxes here? 
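+        # Zero-area boxes produced by the crop are currently kept; the dead
+        # `if False` block below only sketches how they could be detected.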
+ if False: + is_empty = (cropped_xmin == cropped_xmax) | (cropped_ymin == cropped_ymax) + + cropped_box = torch.cat( + (cropped_xmin, cropped_ymin, cropped_xmax, cropped_ymax), dim=-1 + ) + bbox = BoxList(cropped_box, (w, h), mode="xyxy") + # bbox._copy_extra_fields(self) + for k, v in self.extra_fields.items(): + if not isinstance(v, torch.Tensor): + v = v.crop(box) + bbox.add_field(k, v) + return bbox.convert(self.mode) + + # Tensor-like methods + + def to(self, device): + bbox = BoxList(self.bbox.to(device), self.size, self.mode) + for k, v in self.extra_fields.items(): + if hasattr(v, "to"): + v = v.to(device) + bbox.add_field(k, v) + return bbox + + def __getitem__(self, item): + bbox = BoxList(self.bbox[item], self.size, self.mode) + for k, v in self.extra_fields.items(): + bbox.add_field(k, v[item]) + return bbox + + def __len__(self): + return self.bbox.shape[0] + + def clip_to_image(self, remove_empty=True): + TO_REMOVE = 1 + self.bbox[:, 0].clamp_(min=0, max=self.size[0] - TO_REMOVE) + self.bbox[:, 1].clamp_(min=0, max=self.size[1] - TO_REMOVE) + self.bbox[:, 2].clamp_(min=0, max=self.size[0] - TO_REMOVE) + self.bbox[:, 3].clamp_(min=0, max=self.size[1] - TO_REMOVE) + if remove_empty: + box = self.bbox + keep = (box[:, 3] > box[:, 1]) & (box[:, 2] > box[:, 0]) + return self[keep] + return self + + def area(self): + box = self.bbox + if self.mode == "xyxy": + TO_REMOVE = 1 + area = (box[:, 2] - box[:, 0] + TO_REMOVE) * (box[:, 3] - box[:, 1] + TO_REMOVE) + elif self.mode == "xywh": + area = box[:, 2] * box[:, 3] + else: + raise RuntimeError("Should not be here") + + return area + + def copy_with_fields(self, fields, skip_missing=False): + bbox = BoxList(self.bbox, self.size, self.mode) + if not isinstance(fields, (list, tuple)): + fields = [fields] + for field in fields: + if self.has_field(field): + bbox.add_field(field, self.get_field(field)) + elif not skip_missing: + raise KeyError("Field '{}' not found in {}".format(field, self)) + return bbox + + def __repr__(self): + s = self.__class__.__name__ + "(" + s += "num_boxes={}, ".format(len(self)) + s += "image_width={}, ".format(self.size[0]) + s += "image_height={}, ".format(self.size[1]) + s += "mode={})".format(self.mode) + return s + + +if __name__ == "__main__": + bbox = BoxList([[0, 0, 10, 10], [0, 0, 5, 5]], (10, 10)) + s_bbox = bbox.resize((5, 5)) + print(s_bbox) + print(s_bbox.bbox) + + t_bbox = bbox.transpose(0) + print(t_bbox) + print(t_bbox.bbox) diff --git a/maskrcnn_benchmark/structures/boxlist_ops.py b/maskrcnn_benchmark/structures/boxlist_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..dc51212f4ff7abc6d978df75d3de44f956f38f67 --- /dev/null +++ b/maskrcnn_benchmark/structures/boxlist_ops.py @@ -0,0 +1,128 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import torch + +from .bounding_box import BoxList + +from maskrcnn_benchmark.layers import nms as _box_nms + + +def boxlist_nms(boxlist, nms_thresh, max_proposals=-1, score_field="scores"): + """ + Performs non-maximum suppression on a boxlist, with scores specified + in a boxlist field via score_field. 
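+    The boxlist is converted to xyxy internally and converted back to its
+    original mode before being returned; nms_thresh <= 0 is a no-op.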
+ + Arguments: + boxlist(BoxList) + nms_thresh (float) + max_proposals (int): if > 0, then only the top max_proposals are kept + after non-maximum suppression + score_field (str) + """ + if nms_thresh <= 0: + return boxlist + mode = boxlist.mode + boxlist = boxlist.convert("xyxy") + boxes = boxlist.bbox + score = boxlist.get_field(score_field) + keep = _box_nms(boxes, score, nms_thresh) + if max_proposals > 0: + keep = keep[: max_proposals] + boxlist = boxlist[keep] + return boxlist.convert(mode) + + +def remove_small_boxes(boxlist, min_size): + """ + Only keep boxes with both sides >= min_size + + Arguments: + boxlist (Boxlist) + min_size (int) + """ + # TODO maybe add an API for querying the ws / hs + xywh_boxes = boxlist.convert("xywh").bbox + _, _, ws, hs = xywh_boxes.unbind(dim=1) + keep = ( + (ws >= min_size) & (hs >= min_size) + ).nonzero().squeeze(1) + return boxlist[keep] + + +# implementation from https://github.com/kuangliu/torchcv/blob/master/torchcv/utils/box.py +# with slight modifications +def boxlist_iou(boxlist1, boxlist2): + """Compute the intersection over union of two set of boxes. + The box order must be (xmin, ymin, xmax, ymax). + + Arguments: + box1: (BoxList) bounding boxes, sized [N,4]. + box2: (BoxList) bounding boxes, sized [M,4]. + + Returns: + (tensor) iou, sized [N,M]. + + Reference: + https://github.com/chainer/chainercv/blob/master/chainercv/utils/bbox/bbox_iou.py + """ + if boxlist1.size != boxlist2.size: + raise RuntimeError( + "boxlists should have same image size, got {}, {}".format(boxlist1, boxlist2)) + + N = len(boxlist1) + M = len(boxlist2) + + area1 = boxlist1.area() + area2 = boxlist2.area() + + box1, box2 = boxlist1.bbox, boxlist2.bbox + + lt = torch.max(box1[:, None, :2], box2[:, :2]) # [N,M,2] + rb = torch.min(box1[:, None, 2:], box2[:, 2:]) # [N,M,2] + + TO_REMOVE = 1 + + wh = (rb - lt + TO_REMOVE).clamp(min=0) # [N,M,2] + inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] + + iou = inter / (area1[:, None] + area2 - inter) + return iou + + +# TODO redundant, remove +def _cat(tensors, dim=0): + """ + Efficient version of torch.cat that avoids a copy if there is only a single element in a list + """ + assert isinstance(tensors, (list, tuple)) + if len(tensors) == 1: + return tensors[0] + return torch.cat(tensors, dim) + + +def cat_boxlist(bboxes): + """ + Concatenates a list of BoxList (having the same image size) into a + single BoxList + + Arguments: + bboxes (list[BoxList]) + """ + assert isinstance(bboxes, (list, tuple)) + assert all(isinstance(bbox, BoxList) for bbox in bboxes) + + size = bboxes[0].size + assert all(bbox.size == size for bbox in bboxes) + + mode = bboxes[0].mode + assert all(bbox.mode == mode for bbox in bboxes) + + fields = set(bboxes[0].fields()) + assert all(set(bbox.fields()) == fields for bbox in bboxes) + + cat_boxes = BoxList(_cat([bbox.bbox for bbox in bboxes], dim=0), size, mode) + + for field in fields: + data = _cat([bbox.get_field(field) for bbox in bboxes], dim=0) + cat_boxes.add_field(field, data) + + return cat_boxes diff --git a/maskrcnn_benchmark/structures/image_list.py b/maskrcnn_benchmark/structures/image_list.py new file mode 100644 index 0000000000000000000000000000000000000000..590b87a65a23aa94234022bcc530cb00e1e25b47 --- /dev/null +++ b/maskrcnn_benchmark/structures/image_list.py @@ -0,0 +1,72 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
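To make the BoxList operations above concrete, here is a small illustrative snippet. It is not part of the patch and assumes the package from this diff is built and importable (importing `boxlist_ops` pulls in the compiled `nms` layer).

```python
import torch

from maskrcnn_benchmark.structures.bounding_box import BoxList
from maskrcnn_benchmark.structures.boxlist_ops import (
    boxlist_iou,
    cat_boxlist,
    remove_small_boxes,
)

# Two toy boxes in the same (width=10, height=10) image.
a = BoxList(torch.tensor([[0.0, 0.0, 4.0, 4.0]]), (10, 10), mode="xyxy")
b = BoxList(torch.tensor([[2.0, 2.0, 6.0, 6.0]]), (10, 10), mode="xyxy")

iou = boxlist_iou(a, b)        # 1x1 tensor: intersection over union
merged = cat_boxlist([a, b])   # single BoxList holding both boxes
kept = remove_small_boxes(merged, min_size=3)  # drops boxes narrower/shorter than 3

print(iou, len(merged), len(kept))
```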
+from __future__ import division + +import torch + + +class ImageList(object): + """ + Structure that holds a list of images (of possibly + varying sizes) as a single tensor. + This works by padding the images to the same size, + and storing in a field the original sizes of each image + """ + + def __init__(self, tensors, image_sizes): + """ + Arguments: + tensors (tensor) + image_sizes (list[tuple[int, int]]) + """ + self.tensors = tensors + self.image_sizes = image_sizes + + def to(self, *args, **kwargs): + cast_tensor = self.tensors.to(*args, **kwargs) + return ImageList(cast_tensor, self.image_sizes) + + +def to_image_list(tensors, size_divisible=0): + """ + tensors can be an ImageList, a torch.Tensor or + an iterable of Tensors. It can't be a numpy array. + When tensors is an iterable of Tensors, it pads + the Tensors with zeros so that they have the same + shape + """ + if isinstance(tensors, torch.Tensor) and size_divisible > 0: + tensors = [tensors] + + if isinstance(tensors, ImageList): + return tensors + elif isinstance(tensors, torch.Tensor): + # single tensor shape can be inferred + if tensors.dim() == 3: + tensors = tensors[None] + assert tensors.dim() == 4 + image_sizes = [tensor.shape[-2:] for tensor in tensors] + return ImageList(tensors, image_sizes) + elif isinstance(tensors, (tuple, list)): + max_size = tuple(max(s) for s in zip(*[img.shape for img in tensors])) + + # TODO Ideally, just remove this and let me model handle arbitrary + # input sizs + if size_divisible > 0: + import math + + stride = size_divisible + max_size = list(max_size) + max_size[1] = int(math.ceil(max_size[1] / stride) * stride) + max_size[2] = int(math.ceil(max_size[2] / stride) * stride) + max_size = tuple(max_size) + + batch_shape = (len(tensors),) + max_size + batched_imgs = tensors[0].new(*batch_shape).zero_() + for img, pad_img in zip(tensors, batched_imgs): + pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + + image_sizes = [im.shape[-2:] for im in tensors] + + return ImageList(batched_imgs, image_sizes) + else: + raise TypeError("Unsupported type for to_image_list: {}".format(type(tensors))) diff --git a/maskrcnn_benchmark/structures/keypoint.py b/maskrcnn_benchmark/structures/keypoint.py new file mode 100644 index 0000000000000000000000000000000000000000..a6881f72f4f757855105638f2f7a9fca81760bb7 --- /dev/null +++ b/maskrcnn_benchmark/structures/keypoint.py @@ -0,0 +1,188 @@ +import torch + + +# transpose +FLIP_LEFT_RIGHT = 0 +FLIP_TOP_BOTTOM = 1 + +class Keypoints(object): + def __init__(self, keypoints, size, mode=None): + # FIXME remove check once we have better integration with device + # in my version this would consistently return a CPU tensor + device = keypoints.device if isinstance(keypoints, torch.Tensor) else torch.device('cpu') + keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=device) + num_keypoints = keypoints.shape[0] + if num_keypoints: + keypoints = keypoints.view(num_keypoints, -1, 3) + + # TODO should I split them? 
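+        # Layout: (num_instances, num_keypoints, 3), each keypoint stored as an
+        # (x, y, visibility) triple following the COCO convention; visibility is
+        # kept in the same tensor rather than split into a separate field.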
+ # self.visibility = keypoints[..., 2] + self.keypoints = keypoints# [..., :2] + + self.size = size + self.mode = mode + self.extra_fields = {} + + def crop(self, box): + raise NotImplementedError() + + def resize(self, size, *args, **kwargs): + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(size, self.size)) + ratio_w, ratio_h = ratios + resized_data = self.keypoints.clone() + resized_data[..., 0] *= ratio_w + resized_data[..., 1] *= ratio_h + keypoints = type(self)(resized_data, size, self.mode) + for k, v in self.extra_fields.items(): + keypoints.add_field(k, v) + return keypoints + + def transpose(self, method): + if method not in (FLIP_LEFT_RIGHT,): + raise NotImplementedError( + "Only FLIP_LEFT_RIGHT implemented") + + flip_inds = type(self).FLIP_INDS + flipped_data = self.keypoints[:, flip_inds] + width = self.size[0] + TO_REMOVE = 1 + # Flip x coordinates + flipped_data[..., 0] = width - flipped_data[..., 0] - TO_REMOVE + + # Maintain COCO convention that if visibility == 0, then x, y = 0 + inds = flipped_data[..., 2] == 0 + flipped_data[inds] = 0 + + keypoints = type(self)(flipped_data, self.size, self.mode) + for k, v in self.extra_fields.items(): + keypoints.add_field(k, v) + return keypoints + + def to(self, *args, **kwargs): + keypoints = type(self)(self.keypoints.to(*args, **kwargs), self.size, self.mode) + for k, v in self.extra_fields.items(): + if hasattr(v, "to"): + v = v.to(*args, **kwargs) + keypoints.add_field(k, v) + return keypoints + + def __getitem__(self, item): + keypoints = type(self)(self.keypoints[item], self.size, self.mode) + for k, v in self.extra_fields.items(): + keypoints.add_field(k, v[item]) + return keypoints + + def add_field(self, field, field_data): + self.extra_fields[field] = field_data + + def get_field(self, field): + return self.extra_fields[field] + + def __repr__(self): + s = self.__class__.__name__ + '(' + s += 'num_instances={}, '.format(len(self.keypoints)) + s += 'image_width={}, '.format(self.size[0]) + s += 'image_height={})'.format(self.size[1]) + return s + + +def _create_flip_indices(names, flip_map): + full_flip_map = flip_map.copy() + full_flip_map.update({v: k for k, v in flip_map.items()}) + flipped_names = [i if i not in full_flip_map else full_flip_map[i] for i in names] + flip_indices = [names.index(i) for i in flipped_names] + return torch.tensor(flip_indices) + + +class PersonKeypoints(Keypoints): + NAMES = [ + 'nose', + 'left_eye', + 'right_eye', + 'left_ear', + 'right_ear', + 'left_shoulder', + 'right_shoulder', + 'left_elbow', + 'right_elbow', + 'left_wrist', + 'right_wrist', + 'left_hip', + 'right_hip', + 'left_knee', + 'right_knee', + 'left_ankle', + 'right_ankle' + ] + FLIP_MAP = { + 'left_eye': 'right_eye', + 'left_ear': 'right_ear', + 'left_shoulder': 'right_shoulder', + 'left_elbow': 'right_elbow', + 'left_wrist': 'right_wrist', + 'left_hip': 'right_hip', + 'left_knee': 'right_knee', + 'left_ankle': 'right_ankle' + } + + +# TODO this doesn't look great +PersonKeypoints.FLIP_INDS = _create_flip_indices(PersonKeypoints.NAMES, PersonKeypoints.FLIP_MAP) +def kp_connections(keypoints): + kp_lines = [ + [keypoints.index('left_eye'), keypoints.index('right_eye')], + [keypoints.index('left_eye'), keypoints.index('nose')], + [keypoints.index('right_eye'), keypoints.index('nose')], + [keypoints.index('right_eye'), keypoints.index('right_ear')], + [keypoints.index('left_eye'), keypoints.index('left_ear')], + [keypoints.index('right_shoulder'), keypoints.index('right_elbow')], + [keypoints.index('right_elbow'), 
keypoints.index('right_wrist')], + [keypoints.index('left_shoulder'), keypoints.index('left_elbow')], + [keypoints.index('left_elbow'), keypoints.index('left_wrist')], + [keypoints.index('right_hip'), keypoints.index('right_knee')], + [keypoints.index('right_knee'), keypoints.index('right_ankle')], + [keypoints.index('left_hip'), keypoints.index('left_knee')], + [keypoints.index('left_knee'), keypoints.index('left_ankle')], + [keypoints.index('right_shoulder'), keypoints.index('left_shoulder')], + [keypoints.index('right_hip'), keypoints.index('left_hip')], + ] + return kp_lines +PersonKeypoints.CONNECTIONS = kp_connections(PersonKeypoints.NAMES) + + +# TODO make this nicer, this is a direct translation from C2 (but removing the inner loop) +def keypoints_to_heat_map(keypoints, rois, heatmap_size): + if rois.numel() == 0: + return rois.new().long(), rois.new().long() + offset_x = rois[:, 0] + offset_y = rois[:, 1] + scale_x = heatmap_size / (rois[:, 2] - rois[:, 0]) + scale_y = heatmap_size / (rois[:, 3] - rois[:, 1]) + + offset_x = offset_x[:, None] + offset_y = offset_y[:, None] + scale_x = scale_x[:, None] + scale_y = scale_y[:, None] + + x = keypoints[..., 0] + y = keypoints[..., 1] + + x_boundary_inds = x == rois[:, 2][:, None] + y_boundary_inds = y == rois[:, 3][:, None] + + x = (x - offset_x) * scale_x + x = x.floor().long() + y = (y - offset_y) * scale_y + y = y.floor().long() + + x[x_boundary_inds] = heatmap_size - 1 + y[y_boundary_inds] = heatmap_size - 1 + + valid_loc = (x >= 0) & (y >= 0) & (x < heatmap_size) & (y < heatmap_size) + vis = keypoints[..., 2] > 0 + valid = (valid_loc & vis).long() + + lin_ind = y * heatmap_size + x + heatmaps = lin_ind * valid + + return heatmaps, valid diff --git a/maskrcnn_benchmark/structures/segmentation_mask.py b/maskrcnn_benchmark/structures/segmentation_mask.py new file mode 100644 index 0000000000000000000000000000000000000000..8a3f7afefc748f31dbb5add51c071d846bafc4a2 --- /dev/null +++ b/maskrcnn_benchmark/structures/segmentation_mask.py @@ -0,0 +1,291 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import numpy as np +import torch + +import pycocotools.mask as mask_utils + +# transpose +FLIP_LEFT_RIGHT = 0 +FLIP_TOP_BOTTOM = 1 + + +class Mask(object): + """ + This class is unfinished and not meant for use yet + It is supposed to contain the mask for an object as + a 2d tensor + """ + + def __init__(self, masks, size, mode): + self.masks = masks + self.size = size + self.mode = mode + + def transpose(self, method): + if method not in (FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM): + raise NotImplementedError( + "Only FLIP_LEFT_RIGHT and FLIP_TOP_BOTTOM implemented" + ) + + width, height = self.size + if method == FLIP_LEFT_RIGHT: + dim = width + idx = 2 + elif method == FLIP_TOP_BOTTOM: + dim = height + idx = 1 + + flip_idx = list(range(dim)[::-1]) + flipped_masks = self.masks.index_select(dim, flip_idx) + return Mask(flipped_masks, self.size, self.mode) + + def crop(self, box): + w, h = box[2] - box[0], box[3] - box[1] + + cropped_masks = self.masks[:, box[1]: box[3], box[0]: box[2]] + return Mask(cropped_masks, size=(w, h), mode=self.mode) + + def resize(self, size, *args, **kwargs): + pass + + +class Polygons(object): + """ + This class holds a set of polygons that represents a single instance + of an object mask. 
The object can be represented as a set of + polygons + """ + + def __init__(self, polygons, size, mode): + # assert isinstance(polygons, list), '{}'.format(polygons) + if isinstance(polygons, list): + polygons = [torch.as_tensor(p, dtype=torch.float32) for p in polygons] + elif isinstance(polygons, Polygons): + polygons = polygons.polygons + + self.polygons = polygons + self.size = size + self.mode = mode + + def transpose(self, method): + if method not in (FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM): + raise NotImplementedError( + "Only FLIP_LEFT_RIGHT and FLIP_TOP_BOTTOM implemented" + ) + + flipped_polygons = [] + width, height = self.size + if method == FLIP_LEFT_RIGHT: + dim = width + idx = 0 + elif method == FLIP_TOP_BOTTOM: + dim = height + idx = 1 + + for poly in self.polygons: + p = poly.clone() + TO_REMOVE = 1 + p[idx::2] = dim - poly[idx::2] - TO_REMOVE + flipped_polygons.append(p) + + return Polygons(flipped_polygons, size=self.size, mode=self.mode) + + def crop(self, box): + w, h = box[2] - box[0], box[3] - box[1] + + # TODO chck if necessary + w = max(w, 1) + h = max(h, 1) + + cropped_polygons = [] + for poly in self.polygons: + p = poly.clone() + p[0::2] = p[0::2] - box[0] # .clamp(min=0, max=w) + p[1::2] = p[1::2] - box[1] # .clamp(min=0, max=h) + cropped_polygons.append(p) + + return Polygons(cropped_polygons, size=(w, h), mode=self.mode) + + def resize(self, size, *args, **kwargs): + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(size, self.size)) + if ratios[0] == ratios[1]: + ratio = ratios[0] + scaled_polys = [p * ratio for p in self.polygons] + return Polygons(scaled_polys, size, mode=self.mode) + + ratio_w, ratio_h = ratios + scaled_polygons = [] + for poly in self.polygons: + p = poly.clone() + p[0::2] *= ratio_w + p[1::2] *= ratio_h + scaled_polygons.append(p) + + return Polygons(scaled_polygons, size=size, mode=self.mode) + + def convert(self, mode): + width, height = self.size + if mode == "mask": + rles = mask_utils.frPyObjects( + [p.numpy() for p in self.polygons], height, width + ) + rle = mask_utils.merge(rles) + mask = mask_utils.decode(rle) + mask = torch.from_numpy(mask) + # TODO add squeeze? + return mask + + def __repr__(self): + s = self.__class__.__name__ + "(" + s += "num_polygons={}, ".format(len(self.polygons)) + s += "image_width={}, ".format(self.size[0]) + s += "image_height={}, ".format(self.size[1]) + s += "mode={})".format(self.mode) + return s + + +class SegmentationMask(object): + """ + This class stores the segmentations for all objects in the image + """ + + def __init__(self, polygons, size, mode=None): + """ + Arguments: + polygons: a list of list of lists of numbers. The first + level of the list correspond to individual instances, + the second level to all the polygons that compose the + object, and the third level to the polygon coordinates. 
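+                For example, [[[x0, y0, x1, y1, ...]], [[...], [...]]] encodes
+                two instances, the second made of two polygons.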
+ """ + assert isinstance(polygons, list) + + self.polygons = [Polygons(p, size, mode) for p in polygons] + self.size = size + self.mode = mode + + def transpose(self, method): + if method not in (FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM): + raise NotImplementedError( + "Only FLIP_LEFT_RIGHT and FLIP_TOP_BOTTOM implemented" + ) + + flipped = [] + for polygon in self.polygons: + flipped.append(polygon.transpose(method)) + return SegmentationMask(flipped, size=self.size, mode=self.mode) + + def crop(self, box): + w, h = box[2] - box[0], box[3] - box[1] + cropped = [] + for polygon in self.polygons: + cropped.append(polygon.crop(box)) + return SegmentationMask(cropped, size=(w, h), mode=self.mode) + + def resize(self, size, *args, **kwargs): + scaled = [] + for polygon in self.polygons: + scaled.append(polygon.resize(size, *args, **kwargs)) + return SegmentationMask(scaled, size=size, mode=self.mode) + + def to(self, *args, **kwargs): + return self + + def __getitem__(self, item): + if isinstance(item, (int, slice)): + selected_polygons = [self.polygons[item]] + else: + # advanced indexing on a single dimension + selected_polygons = [] + if isinstance(item, torch.Tensor) and item.dtype == torch.uint8: + item = item.nonzero() + item = item.squeeze(1) if item.numel() > 0 else item + item = item.tolist() + for i in item: + selected_polygons.append(self.polygons[i]) + return SegmentationMask(selected_polygons, size=self.size, mode=self.mode) + + def __iter__(self): + return iter(self.polygons) + + def __repr__(self): + s = self.__class__.__name__ + "(" + s += "num_instances={}, ".format(len(self.polygons)) + s += "image_width={}, ".format(self.size[0]) + s += "image_height={})".format(self.size[1]) + return s + + +class DensityMap(object): + """ + This class is unfinished and not meant for use yet + It is supposed to contain the mask for an object as + a 2d tensor + """ + + def __init__(self, density_map): + # height, width + self.density_map = density_map + + def transpose(self, method): + assert isinstance(self.density_map, np.ndarray) + if method not in (FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM): + raise NotImplementedError( + "Only FLIP_LEFT_RIGHT and FLIP_TOP_BOTTOM implemented" + ) + if method == FLIP_LEFT_RIGHT: + return DensityMap(np.fliplr(self.density_map)) + elif method == FLIP_TOP_BOTTOM: + return DensityMap(np.flipud(self.density_map)) + + def crop(self, box): + cropped_density_map = self.density_map[box[1]: box[3], box[0]: box[2]] + return DensityMap(cropped_density_map) + + def to(self, *args, **kwargs): + density_map = self.density_map + if isinstance(self.density_map, np.ndarray): + density_map = torch.from_numpy(density_map) + density_map = density_map.to(*args, **kwargs) + return DensityMap(density_map) + + def __getitem__(self, item): + # density map not support indexing + return self + + def resize(self, size, *args, **kwargs): + # density map doesn't need resize + return self + + +class Heatmap(object): + """ + This class is unfinished and not meant for use yet + It is supposed to contain the mask for an object as + a 2d tensor + """ + + def __init__(self, heatmap, count=0): + # (num_classes, height, width) + assert isinstance(heatmap, torch.Tensor) and heatmap.ndimension() == 3 + self.heatmap = heatmap + self.count = count + + def transpose(self, method): + if method not in (FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM): + raise NotImplementedError( + "Only FLIP_LEFT_RIGHT and FLIP_TOP_BOTTOM implemented" + ) + if method == FLIP_LEFT_RIGHT: + return Heatmap(torch.flip(self.heatmap, dims=(2,)), 
count=self.count) + elif method == FLIP_TOP_BOTTOM: + return Heatmap(torch.flip(self.heatmap, dims=(1,)), count=self.count) + + def to(self, *args, **kwargs): + heatmap = self.heatmap.to(*args, **kwargs) + return Heatmap(heatmap, count=self.count) + + def __getitem__(self, item): + return self + + def resize(self, *args, **kwargs): + return self diff --git a/maskrcnn_benchmark/utils/README.md b/maskrcnn_benchmark/utils/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9765b24a730b77556104187ac3ef5439ab0859fd --- /dev/null +++ b/maskrcnn_benchmark/utils/README.md @@ -0,0 +1,5 @@ +# Utility functions + +This folder contain utility functions that are not used in the +core library, but are useful for building models or training +code using the config system. diff --git a/maskrcnn_benchmark/utils/__init__.py b/maskrcnn_benchmark/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/maskrcnn_benchmark/utils/c2_model_loading.py b/maskrcnn_benchmark/utils/c2_model_loading.py new file mode 100644 index 0000000000000000000000000000000000000000..b1b9996e8c20b7e947acd601251fbac0854701d7 --- /dev/null +++ b/maskrcnn_benchmark/utils/c2_model_loading.py @@ -0,0 +1,175 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import logging +import pickle +from collections import OrderedDict + +import torch + +from maskrcnn_benchmark.utils.model_serialization import load_state_dict +from maskrcnn_benchmark.utils.registry import Registry + + +def _rename_basic_resnet_weights(layer_keys): + layer_keys = [k.replace("_", ".") for k in layer_keys] + layer_keys = [k.replace(".w", ".weight") for k in layer_keys] + layer_keys = [k.replace(".bn", "_bn") for k in layer_keys] + layer_keys = [k.replace(".b", ".bias") for k in layer_keys] + layer_keys = [k.replace("_bn.s", "_bn.scale") for k in layer_keys] + layer_keys = [k.replace(".biasranch", ".branch") for k in layer_keys] + layer_keys = [k.replace("bbox.pred", "bbox_pred") for k in layer_keys] + layer_keys = [k.replace("cls.score", "cls_score") for k in layer_keys] + layer_keys = [k.replace("res.conv1_", "conv1_") for k in layer_keys] + + # RPN / Faster RCNN + layer_keys = [k.replace(".biasbox", ".bbox") for k in layer_keys] + layer_keys = [k.replace("conv.rpn", "rpn.conv") for k in layer_keys] + layer_keys = [k.replace("rpn.bbox.pred", "rpn.bbox_pred") for k in layer_keys] + layer_keys = [k.replace("rpn.cls.logits", "rpn.cls_logits") for k in layer_keys] + + # Affine-Channel -> BatchNorm enaming + layer_keys = [k.replace("_bn.scale", "_bn.weight") for k in layer_keys] + + # Make torchvision-compatible + layer_keys = [k.replace("conv1_bn.", "bn1.") for k in layer_keys] + + layer_keys = [k.replace("res2.", "layer1.") for k in layer_keys] + layer_keys = [k.replace("res3.", "layer2.") for k in layer_keys] + layer_keys = [k.replace("res4.", "layer3.") for k in layer_keys] + layer_keys = [k.replace("res5.", "layer4.") for k in layer_keys] + + layer_keys = [k.replace(".branch2a.", ".conv1.") for k in layer_keys] + layer_keys = [k.replace(".branch2a_bn.", ".bn1.") for k in layer_keys] + layer_keys = [k.replace(".branch2b.", ".conv2.") for k in layer_keys] + layer_keys = [k.replace(".branch2b_bn.", ".bn2.") for k in layer_keys] + layer_keys = [k.replace(".branch2c.", ".conv3.") for k in layer_keys] + layer_keys = [k.replace(".branch2c_bn.", ".bn3.") for k in layer_keys] + + layer_keys = [k.replace(".branch1.", ".downsample.0.") for k in 
layer_keys] + layer_keys = [k.replace(".branch1_bn.", ".downsample.1.") for k in layer_keys] + + # GroupNorm + layer_keys = [k.replace("conv1.gn.s", "bn1.weight") for k in layer_keys] + layer_keys = [k.replace("conv1.gn.bias", "bn1.bias") for k in layer_keys] + layer_keys = [k.replace("conv2.gn.s", "bn2.weight") for k in layer_keys] + layer_keys = [k.replace("conv2.gn.bias", "bn2.bias") for k in layer_keys] + layer_keys = [k.replace("conv3.gn.s", "bn3.weight") for k in layer_keys] + layer_keys = [k.replace("conv3.gn.bias", "bn3.bias") for k in layer_keys] + layer_keys = [k.replace("downsample.0.gn.s", "downsample.1.weight") \ + for k in layer_keys] + layer_keys = [k.replace("downsample.0.gn.bias", "downsample.1.bias") \ + for k in layer_keys] + + return layer_keys + +def _rename_fpn_weights(layer_keys, stage_names): + for mapped_idx, stage_name in enumerate(stage_names, 1): + suffix = "" + if mapped_idx < 4: + suffix = ".lateral" + layer_keys = [ + k.replace("fpn.inner.layer{}.sum{}".format(stage_name, suffix), "fpn_inner{}".format(mapped_idx)) for k in layer_keys + ] + layer_keys = [k.replace("fpn.layer{}.sum".format(stage_name), "fpn_layer{}".format(mapped_idx)) for k in layer_keys] + + + layer_keys = [k.replace("rpn.conv.fpn2", "rpn.conv") for k in layer_keys] + layer_keys = [k.replace("rpn.bbox_pred.fpn2", "rpn.bbox_pred") for k in layer_keys] + layer_keys = [ + k.replace("rpn.cls_logits.fpn2", "rpn.cls_logits") for k in layer_keys + ] + + return layer_keys + + +def _rename_weights_for_resnet(weights, stage_names): + original_keys = sorted(weights.keys()) + layer_keys = sorted(weights.keys()) + + # for X-101, rename output to fc1000 to avoid conflicts afterwards + layer_keys = [k if k != "pred_b" else "fc1000_b" for k in layer_keys] + layer_keys = [k if k != "pred_w" else "fc1000_w" for k in layer_keys] + + # performs basic renaming: _ -> . 
, etc + layer_keys = _rename_basic_resnet_weights(layer_keys) + + # FPN + layer_keys = _rename_fpn_weights(layer_keys, stage_names) + + # Mask R-CNN + layer_keys = [k.replace("mask.fcn.logits", "mask_fcn_logits") for k in layer_keys] + layer_keys = [k.replace(".[mask].fcn", "mask_fcn") for k in layer_keys] + layer_keys = [k.replace("conv5.mask", "conv5_mask") for k in layer_keys] + + # Keypoint R-CNN + layer_keys = [k.replace("kps.score.lowres", "kps_score_lowres") for k in layer_keys] + layer_keys = [k.replace("kps.score", "kps_score") for k in layer_keys] + layer_keys = [k.replace("conv.fcn", "conv_fcn") for k in layer_keys] + + # Rename for our RPN structure + layer_keys = [k.replace("rpn.", "rpn.head.") for k in layer_keys] + + key_map = {k: v for k, v in zip(original_keys, layer_keys)} + + logger = logging.getLogger(__name__) + logger.info("Remapping C2 weights") + max_c2_key_size = max([len(k) for k in original_keys if "_momentum" not in k]) + + new_weights = OrderedDict() + for k in original_keys: + v = weights[k] + if "_momentum" in k: + continue + # if 'fc1000' in k: + # continue + w = torch.from_numpy(v) + # if "bn" in k: + # w = w.view(1, -1, 1, 1) + logger.info("C2 name: {: <{}} mapped name: {}".format(k, max_c2_key_size, key_map[k])) + new_weights[key_map[k]] = w + + return new_weights + + +def _load_c2_pickled_weights(file_path): + with open(file_path, "rb") as f: + if torch._six.PY3: + data = pickle.load(f, encoding="latin1") + else: + data = pickle.load(f) + if "blobs" in data: + weights = data["blobs"] + else: + weights = data + return weights + + +_C2_STAGE_NAMES = { + "R-50": ["1.2", "2.3", "3.5", "4.2"], + "R-101": ["1.2", "2.3", "3.22", "4.2"], + "R-152": ["1.2", "2.7", "3.35", "4.2"], +} + +C2_FORMAT_LOADER = Registry() + + +@C2_FORMAT_LOADER.register("R-50-C4") +@C2_FORMAT_LOADER.register("R-50-C5") +@C2_FORMAT_LOADER.register("R-101-C4") +@C2_FORMAT_LOADER.register("R-101-C5") +@C2_FORMAT_LOADER.register("R-50-FPN") +@C2_FORMAT_LOADER.register("R-50-FPN-RETINANET") +@C2_FORMAT_LOADER.register("R-101-FPN") +@C2_FORMAT_LOADER.register("R-101-FPN-RETINANET") +@C2_FORMAT_LOADER.register("R-152-FPN") +def load_resnet_c2_format(cfg, f): + state_dict = _load_c2_pickled_weights(f) + conv_body = cfg.MODEL.BACKBONE.CONV_BODY + arch = conv_body.replace("-C4", "").replace("-C5", "").replace("-FPN", "") + arch = arch.replace("-RETINANET", "") + stages = _C2_STAGE_NAMES[arch] + state_dict = _rename_weights_for_resnet(state_dict, stages) + return dict(model=state_dict) + + +def load_c2_format(cfg, f): + return C2_FORMAT_LOADER[cfg.MODEL.BACKBONE.CONV_BODY](cfg, f) diff --git a/maskrcnn_benchmark/utils/checkpoint.py b/maskrcnn_benchmark/utils/checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..dc403f5dbf291ded515c669bf5f4caaf82a73c8a --- /dev/null +++ b/maskrcnn_benchmark/utils/checkpoint.py @@ -0,0 +1,139 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
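The long chain of string replacements above is easier to trust with a concrete trace. The following check is illustrative only, not part of the patch; it assumes the package from this diff is importable and uses a representative Detectron/Caffe2 blob name.

```python
from maskrcnn_benchmark.utils.c2_model_loading import _rename_basic_resnet_weights

# A Caffe2-style blob name for the first conv of the first res2 block is
# rewritten into the torchvision-style name used by this codebase.
assert _rename_basic_resnet_weights(["res2_0_branch2a_w"]) == ["layer1.0.conv1.weight"]

# The loader registry key is derived from cfg.MODEL.BACKBONE.CONV_BODY by
# stripping the head/FPN suffixes, mirroring load_resnet_c2_format above.
arch = "R-50-FPN-RETINANET".replace("-C4", "").replace("-C5", "")
arch = arch.replace("-FPN", "").replace("-RETINANET", "")
assert arch == "R-50"
```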
+import logging +import os + +import torch + +from maskrcnn_benchmark.utils.model_serialization import load_state_dict +from maskrcnn_benchmark.utils.c2_model_loading import load_c2_format +from maskrcnn_benchmark.utils.imports import import_file +from maskrcnn_benchmark.utils.model_zoo import cache_url + + +class Checkpointer(object): + def __init__( + self, + model, + optimizer=None, + scheduler=None, + save_dir="", + save_to_disk=None, + logger=None, + ): + self.model = model + self.optimizer = optimizer + self.scheduler = scheduler + self.save_dir = save_dir + self.save_to_disk = save_to_disk + if logger is None: + logger = logging.getLogger(__name__) + self.logger = logger + + def save(self, name, **kwargs): + if not self.save_dir: + return + + if not self.save_to_disk: + return + + data = {} + data["model"] = self.model.state_dict() + if self.optimizer is not None: + data["optimizer"] = self.optimizer.state_dict() + if self.scheduler is not None: + data["scheduler"] = self.scheduler.state_dict() + data.update(kwargs) + + save_file = os.path.join(self.save_dir, "{}.pth".format(name)) + self.logger.info("Saving checkpoint to {}".format(save_file)) + torch.save(data, save_file) + self.tag_last_checkpoint(save_file) + + def load(self, f=None): + if self.has_checkpoint(): + # override argument with existing checkpoint + f = self.get_checkpoint_file() + if not f: + # no checkpoint could be found + self.logger.info("No checkpoint found. Initializing model from scratch") + return {} + self.logger.info("Loading checkpoint from {}".format(f)) + checkpoint = self._load_file(f) + self._load_model(checkpoint) + if "optimizer" in checkpoint and self.optimizer: + self.logger.info("Loading optimizer from {}".format(f)) + self.optimizer.load_state_dict(checkpoint.pop("optimizer")) + if "scheduler" in checkpoint and self.scheduler: + self.logger.info("Loading scheduler from {}".format(f)) + self.scheduler.load_state_dict(checkpoint.pop("scheduler")) + + # return any further checkpoint data + return checkpoint + + def has_checkpoint(self): + save_file = os.path.join(self.save_dir, "last_checkpoint") + return os.path.exists(save_file) + + def get_checkpoint_file(self): + save_file = os.path.join(self.save_dir, "last_checkpoint") + try: + with open(save_file, "r") as f: + last_saved = f.read() + last_saved = last_saved.strip() + except IOError: + # if file doesn't exist, maybe because it has just been + # deleted by a separate process + last_saved = "" + return last_saved + + def tag_last_checkpoint(self, last_filename): + save_file = os.path.join(self.save_dir, "last_checkpoint") + with open(save_file, "w") as f: + f.write(last_filename) + + def _load_file(self, f): + return torch.load(f, map_location=torch.device("cpu")) + + def _load_model(self, checkpoint): + load_state_dict(self.model, checkpoint.pop("model")) + + +class DetectronCheckpointer(Checkpointer): + def __init__( + self, + cfg, + model, + optimizer=None, + scheduler=None, + save_dir="", + save_to_disk=None, + logger=None, + ): + super(DetectronCheckpointer, self).__init__( + model, optimizer, scheduler, save_dir, save_to_disk, logger + ) + self.cfg = cfg.clone() + + def _load_file(self, f): + # catalog lookup + if f.startswith("catalog://"): + paths_catalog = import_file( + "maskrcnn_benchmark.config.paths_catalog", self.cfg.PATHS_CATALOG, True + ) + catalog_f = paths_catalog.ModelCatalog.get(f[len("catalog://") :]) + self.logger.info("{} points to {}".format(f, catalog_f)) + f = catalog_f + # download url files + if f.startswith("http"): 
+ # if the file is a url path, download it and cache it + cached_f = cache_url(f) + self.logger.info("url {} cached in {}".format(f, cached_f)) + f = cached_f + # convert Caffe2 checkpoint from pkl + if f.endswith(".pkl"): + return load_c2_format(self.cfg, f) + # load native detectron.pytorch checkpoint + loaded = super(DetectronCheckpointer, self)._load_file(f) + if "model" not in loaded: + loaded = dict(model=loaded) + return loaded diff --git a/maskrcnn_benchmark/utils/collect_env.py b/maskrcnn_benchmark/utils/collect_env.py new file mode 100644 index 0000000000000000000000000000000000000000..2d0641dda61c9950cb54d0552106246248e571ef --- /dev/null +++ b/maskrcnn_benchmark/utils/collect_env.py @@ -0,0 +1,14 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import PIL + +from torch.utils.collect_env import get_pretty_env_info + + +def get_pil_version(): + return "\n Pillow ({})".format(PIL.__version__) + + +def collect_env_info(): + env_str = get_pretty_env_info() + env_str += get_pil_version() + return env_str diff --git a/maskrcnn_benchmark/utils/comm.py b/maskrcnn_benchmark/utils/comm.py new file mode 100644 index 0000000000000000000000000000000000000000..46d7c55ce04b4180def3909cd0989c21e544085f --- /dev/null +++ b/maskrcnn_benchmark/utils/comm.py @@ -0,0 +1,117 @@ +""" +This file contains primitives for multi-gpu communication. +This is useful when doing distributed training. +""" + +import pickle +import time + +import torch +import torch.distributed as dist + + +def get_world_size(): + if not dist.is_available(): + return 1 + if not dist.is_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + if not dist.is_available(): + return 0 + if not dist.is_initialized(): + return 0 + return dist.get_rank() + + +def is_main_process(): + return get_rank() == 0 + + +def synchronize(): + """ + Helper function to synchronize (barrier) among all processes when + using distributed training + """ + if not dist.is_available(): + return + if not dist.is_initialized(): + return + world_size = dist.get_world_size() + if world_size == 1: + return + dist.barrier() + + +def all_gather(data): + """ + Run all_gather on arbitrary picklable data (not necessarily tensors) + Args: + data: any picklable object + Returns: + list[data]: list of data gathered from each rank + """ + world_size = get_world_size() + if world_size == 1: + return [data] + + # serialized to a Tensor + buffer = pickle.dumps(data) + storage = torch.ByteStorage.from_buffer(buffer) + tensor = torch.ByteTensor(storage).to("cuda") + + # obtain Tensor size of each rank + local_size = torch.IntTensor([tensor.numel()]).to("cuda") + size_list = [torch.IntTensor([0]).to("cuda") for _ in range(world_size)] + dist.all_gather(size_list, local_size) + size_list = [int(size.item()) for size in size_list] + max_size = max(size_list) + + # receiving Tensor from all ranks + # we pad the tensor because torch all_gather does not support + # gathering tensors of different shapes + tensor_list = [] + for _ in size_list: + tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda")) + if local_size != max_size: + padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda") + tensor = torch.cat((tensor, padding), dim=0) + dist.all_gather(tensor_list, tensor) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + + return data_list + + +def reduce_dict(input_dict, average=True): + """ + Args: + 
input_dict (dict): all the values will be reduced + average (bool): whether to do average or sum + Reduce the values in the dictionary from all processes so that process with rank + 0 has the averaged results. Returns a dict with the same fields as + input_dict, after reduction. + """ + world_size = get_world_size() + if world_size < 2: + return input_dict + with torch.no_grad(): + names = [] + values = [] + # sort the keys so that they are consistent across processes + for k in sorted(input_dict.keys()): + names.append(k) + values.append(input_dict[k]) + values = torch.stack(values, dim=0) + dist.reduce(values, dst=0) + if dist.get_rank() == 0 and average: + # only main process gets accumulated, so only divide by + # world_size in this case + values /= world_size + reduced_dict = {k: v for k, v in zip(names, values)} + return reduced_dict diff --git a/maskrcnn_benchmark/utils/cv2_util.py b/maskrcnn_benchmark/utils/cv2_util.py new file mode 100644 index 0000000000000000000000000000000000000000..0bbc0fb2d08337bfd8242cbedd514a41d8d7353f --- /dev/null +++ b/maskrcnn_benchmark/utils/cv2_util.py @@ -0,0 +1,24 @@ +""" +Module for cv2 utility functions and maintaining version compatibility +between 3.x and 4.x +""" +import cv2 + + +def findContours(*args, **kwargs): + """ + Wraps cv2.findContours to maintain compatiblity between versions + 3 and 4 + + Returns: + contours, hierarchy + """ + if cv2.__version__.startswith('4'): + contours, hierarchy = cv2.findContours(*args, **kwargs) + elif cv2.__version__.startswith('3'): + _, contours, hierarchy = cv2.findContours(*args, **kwargs) + else: + raise AssertionError( + 'cv2 must be either version 3 or 4 to call this method') + + return contours, hierarchy diff --git a/maskrcnn_benchmark/utils/density.py b/maskrcnn_benchmark/utils/density.py new file mode 100644 index 0000000000000000000000000000000000000000..dd77d42c8645213dbda4d56184f47aece8487e16 --- /dev/null +++ b/maskrcnn_benchmark/utils/density.py @@ -0,0 +1,68 @@ +import numpy as np +from scipy import ndimage + +_categories = (-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, + 7, 7, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, + 16, 16, 16, 16, 16, 16, 16) + +RPC_SUPPORT_CATEGORIES = (1, 17, 200) + +_coco_categories = ( + -1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, + 7, + 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11) +COCO_SUPPORT_CATEGORIES = (1, 12, 80) + + +def contiguous_coco_category_to_super_category(category_id, num_classes): + cat_id = -1 + assert num_classes in COCO_SUPPORT_CATEGORIES, 'Not support {} density categories'.format(num_classes) + if num_classes == 12: + cat_id = _coco_categories[category_id] + elif num_classes == 1: + cat_id = 0 + elif num_classes == 80: + cat_id = category_id - 1 + assert 79 >= cat_id >= 0 + return cat_id + + +def 
rpc_category_to_super_category(category_id, num_classes): + """Map category to super-category id + Args: + category_id: list of category ids, 1-based + num_classes: 1, 17, 200 + Returns: + super-category id, 0-based + """ + cat_id = -1 + assert num_classes in RPC_SUPPORT_CATEGORIES, 'Not support {} density categories'.format(num_classes) + if num_classes == 17: + cat_id = _categories[category_id] + elif num_classes == 1: + cat_id = 0 + elif num_classes == 200: + cat_id = category_id - 1 + assert 199 >= cat_id >= 0 + return cat_id + + +def generate_density_map(labels, boxes, scale=50.0 / 800, size=50, num_classes=200, min_sigma=1): + density_map = np.zeros((num_classes, size, size), dtype=np.float32) + for category, box in zip(labels, boxes): + x1, y1, x2, y2 = [x * scale for x in box] + w, h = x2 - x1, y2 - y1 + box_radius = min(w, h) / 2 + sigma = max(min_sigma, box_radius * 5 / (4 * 3)) # 3/5 of gaussian kernel is in box + cx, cy = round((x1 + x2) / 2), round((y1 + y2) / 2) + density = np.zeros((size, size), dtype=np.float32) + density[cy, cx] = 1 + density = ndimage.filters.gaussian_filter(density, sigma, mode='constant') + density_map[category, :, :] += density + + return density_map diff --git a/maskrcnn_benchmark/utils/env.py b/maskrcnn_benchmark/utils/env.py new file mode 100644 index 0000000000000000000000000000000000000000..1c7db32e41ec266ead9734f90d0173b4feff61ef --- /dev/null +++ b/maskrcnn_benchmark/utils/env.py @@ -0,0 +1,37 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import os + +from maskrcnn_benchmark.utils.imports import import_file + + +def setup_environment(): + """Perform environment setup work. The default setup is a no-op, but this + function allows the user to specify a Python source file that performs + custom setup work that may be necessary to their computing environment. + """ + custom_module_path = os.environ.get("TORCH_DETECTRON_ENV_MODULE") + if custom_module_path: + setup_custom_environment(custom_module_path) + else: + # The default setup is a no-op + pass + + +def setup_custom_environment(custom_module_path): + """Load custom environment setup from a Python source file and run the setup + function. + """ + module = import_file("maskrcnn_benchmark.utils.env.custom_module", custom_module_path) + assert hasattr(module, "setup_environment") and callable( + module.setup_environment + ), ( + "Custom environment module defined in {} does not have the " + "required callable attribute 'setup_environment'." + ).format( + custom_module_path + ) + module.setup_environment() + + +# Force environment setup when this module is imported +setup_environment() diff --git a/maskrcnn_benchmark/utils/imports.py b/maskrcnn_benchmark/utils/imports.py new file mode 100644 index 0000000000000000000000000000000000000000..53e27e2bcfd6d9dd57579f48d42811072daf0df5 --- /dev/null +++ b/maskrcnn_benchmark/utils/imports.py @@ -0,0 +1,23 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
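+# import_file() loads a Python module straight from a path on disk and can
+# optionally register it in sys.modules. Illustrative sketch (the path and the
+# attribute accessed below are placeholders, not part of this repository):
+#
+#     module = import_file("my_paths_catalog", "/tmp/paths_catalog.py",
+#                          make_importable=True)
+#     catalog = module.DatasetCatalog   # hypothetical attribute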
+import torch + +if torch._six.PY3: + import importlib + import importlib.util + import sys + + + # from https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa + def import_file(module_name, file_path, make_importable=False): + spec = importlib.util.spec_from_file_location(module_name, file_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + if make_importable: + sys.modules[module_name] = module + return module +else: + import imp + + def import_file(module_name, file_path, make_importable=None): + module = imp.load_source(module_name, file_path) + return module diff --git a/maskrcnn_benchmark/utils/logger.py b/maskrcnn_benchmark/utils/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..13847a3a76b481e132190ee0757b3539fb8981ae --- /dev/null +++ b/maskrcnn_benchmark/utils/logger.py @@ -0,0 +1,25 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import logging +import os +import sys + + +def setup_logger(name, save_dir, distributed_rank, filename="log.txt"): + logger = logging.getLogger(name) + logger.setLevel(logging.DEBUG) + # don't log results for the non-master process + if distributed_rank > 0: + return logger + ch = logging.StreamHandler(stream=sys.stdout) + ch.setLevel(logging.DEBUG) + formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s") + ch.setFormatter(formatter) + logger.addHandler(ch) + + if save_dir: + fh = logging.FileHandler(os.path.join(save_dir, filename)) + fh.setLevel(logging.DEBUG) + fh.setFormatter(formatter) + logger.addHandler(fh) + + return logger diff --git a/maskrcnn_benchmark/utils/metric_logger.py b/maskrcnn_benchmark/utils/metric_logger.py new file mode 100644 index 0000000000000000000000000000000000000000..5e37a72ea4b4c85546de98210295a6adb134a297 --- /dev/null +++ b/maskrcnn_benchmark/utils/metric_logger.py @@ -0,0 +1,66 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from collections import defaultdict +from collections import deque + +import torch + + +class SmoothedValue(object): + """Track a series of values and provide access to smoothed values over a + window or the global series average. 
+ """ + + def __init__(self, window_size=20): + self.deque = deque(maxlen=window_size) + self.series = [] + self.total = 0.0 + self.count = 0 + + def update(self, value): + self.deque.append(value) + self.series.append(value) + self.count += 1 + self.total += value + + @property + def median(self): + d = torch.tensor(list(self.deque)) + return d.median().item() + + @property + def avg(self): + d = torch.tensor(list(self.deque)) + return d.mean().item() + + @property + def global_avg(self): + return self.total / self.count + + +class MetricLogger(object): + def __init__(self, delimiter="\t"): + self.meters = defaultdict(SmoothedValue) + self.delimiter = delimiter + + def update(self, **kwargs): + for k, v in kwargs.items(): + if isinstance(v, torch.Tensor): + v = v.item() + assert isinstance(v, (float, int)) + self.meters[k].update(v) + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError("'{}' object has no attribute '{}'".format( + type(self).__name__, attr)) + + def __str__(self): + loss_str = [] + for name, meter in self.meters.items(): + loss_str.append( + "{}: {:.4f} ({:.4f})".format(name, meter.median, meter.global_avg) + ) + return self.delimiter.join(loss_str) diff --git a/maskrcnn_benchmark/utils/miscellaneous.py b/maskrcnn_benchmark/utils/miscellaneous.py new file mode 100644 index 0000000000000000000000000000000000000000..db9a8b3679ceea2a5cd2b807421793bbbd3d3677 --- /dev/null +++ b/maskrcnn_benchmark/utils/miscellaneous.py @@ -0,0 +1,11 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import errno +import os + + +def mkdir(path): + try: + os.makedirs(path) + except OSError as e: + if e.errno != errno.EEXIST: + raise diff --git a/maskrcnn_benchmark/utils/model_serialization.py b/maskrcnn_benchmark/utils/model_serialization.py new file mode 100644 index 0000000000000000000000000000000000000000..a95ad8b2a7a787d62dc3ea580b2dfd30e358da28 --- /dev/null +++ b/maskrcnn_benchmark/utils/model_serialization.py @@ -0,0 +1,80 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from collections import OrderedDict +import logging + +import torch + +from maskrcnn_benchmark.utils.imports import import_file + + +def align_and_update_state_dicts(model_state_dict, loaded_state_dict): + """ + Strategy: suppose that the models that we will create will have prefixes appended + to each of its keys, for example due to an extra level of nesting that the original + pre-trained weights from ImageNet won't contain. For example, model.state_dict() + might return backbone[0].body.res2.conv1.weight, while the pre-trained model contains + res2.conv1.weight. We thus want to match both parameters together. + For that, we look for each model weight, look among all loaded keys if there is one + that is a suffix of the current weight name, and use it if that's the case. + If multiple matches exist, take the one with longest size + of the corresponding name. For example, for the same model as before, the pretrained + weight file can contain both res2.conv1.weight, as well as conv1.weight. In this case, + we want to match backbone[0].body.conv1.weight to conv1.weight, and + backbone[0].body.res2.conv1.weight to res2.conv1.weight. 
+ """ + current_keys = sorted(list(model_state_dict.keys())) + loaded_keys = sorted(list(loaded_state_dict.keys())) + # get a matrix of string matches, where each (i, j) entry correspond to the size of the + # loaded_key string, if it matches + match_matrix = [ + len(j) if i.endswith(j) else 0 for i in current_keys for j in loaded_keys + ] + match_matrix = torch.as_tensor(match_matrix).view( + len(current_keys), len(loaded_keys) + ) + max_match_size, idxs = match_matrix.max(1) + # remove indices that correspond to no-match + idxs[max_match_size == 0] = -1 + + # used for logging + max_size = max([len(key) for key in current_keys]) if current_keys else 1 + max_size_loaded = max([len(key) for key in loaded_keys]) if loaded_keys else 1 + log_str_template = "{: <{}} loaded from {: <{}} of shape {}" + logger = logging.getLogger(__name__) + for idx_new, idx_old in enumerate(idxs.tolist()): + if idx_old == -1: + continue + key = current_keys[idx_new] + key_old = loaded_keys[idx_old] + model_state_dict[key] = loaded_state_dict[key_old] + logger.info( + log_str_template.format( + key, + max_size, + key_old, + max_size_loaded, + tuple(loaded_state_dict[key_old].shape), + ) + ) + + +def strip_prefix_if_present(state_dict, prefix): + keys = sorted(state_dict.keys()) + if not all(key.startswith(prefix) for key in keys): + return state_dict + stripped_state_dict = OrderedDict() + for key, value in state_dict.items(): + stripped_state_dict[key.replace(prefix, "")] = value + return stripped_state_dict + + +def load_state_dict(model, loaded_state_dict): + model_state_dict = model.state_dict() + # if the state_dict comes from a model that was wrapped in a + # DataParallel or DistributedDataParallel during serialization, + # remove the "module" prefix before performing the matching + loaded_state_dict = strip_prefix_if_present(loaded_state_dict, prefix="module.") + align_and_update_state_dicts(model_state_dict, loaded_state_dict) + + # use strict loading + model.load_state_dict(model_state_dict) diff --git a/maskrcnn_benchmark/utils/model_zoo.py b/maskrcnn_benchmark/utils/model_zoo.py new file mode 100644 index 0000000000000000000000000000000000000000..8aea289ebb31e69395d3bb90103c2ce17d0a8389 --- /dev/null +++ b/maskrcnn_benchmark/utils/model_zoo.py @@ -0,0 +1,61 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import os +import sys + +try: + from torch.utils.model_zoo import _download_url_to_file + from torch.utils.model_zoo import urlparse + from torch.utils.model_zoo import HASH_REGEX +except ImportError: + from torch.hub import _download_url_to_file + from torch.hub import urlparse + from torch.hub import HASH_REGEX + +from maskrcnn_benchmark.utils.comm import is_main_process +from maskrcnn_benchmark.utils.comm import synchronize + + +# very similar to https://github.com/pytorch/pytorch/blob/master/torch/utils/model_zoo.py +# but with a few improvements and modifications +def cache_url(url, model_dir=None, progress=True): + r"""Loads the Torch serialized object at the given URL. + If the object is already present in `model_dir`, it's deserialized and + returned. The filename part of the URL should follow the naming convention + ``filename-.ext`` where ```` is the first eight or more + digits of the SHA256 hash of the contents of the file. The hash is used to + ensure unique names and to verify the contents of the file. + The default value of `model_dir` is ``$TORCH_HOME/models`` where + ``$TORCH_HOME`` defaults to ``~/.torch``. 
The default directory can be + overridden with the ``$TORCH_MODEL_ZOO`` environment variable. + Args: + url (string): URL of the object to download + model_dir (string, optional): directory in which to save the object + progress (bool, optional): whether or not to display a progress bar to stderr + Example: + >>> cached_file = maskrcnn_benchmark.utils.model_zoo.cache_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth') + """ + if model_dir is None: + torch_home = os.path.expanduser(os.getenv('TORCH_HOME', '~/.torch')) + model_dir = os.getenv('TORCH_MODEL_ZOO', os.path.join(torch_home, 'models')) + if not os.path.exists(model_dir): + os.makedirs(model_dir) + parts = urlparse(url) + filename = os.path.basename(parts.path) + if filename == "model_final.pkl": + # workaround as pre-trained Caffe2 models from Detectron have all the same filename + # so make the full path the filename by replacing / with _ + filename = parts.path.replace("/", "_") + cached_file = os.path.join(model_dir, filename) + if not os.path.exists(cached_file) and is_main_process(): + sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file)) + hash_prefix = HASH_REGEX.search(filename) + if hash_prefix is not None: + hash_prefix = hash_prefix.group(1) + # workaround: Caffe2 models don't have a hash, but follow the R-50 convention, + # which matches the hash PyTorch uses. So we skip the hash matching + # if the hash_prefix is less than 6 characters + if len(hash_prefix) < 6: + hash_prefix = None + _download_url_to_file(url, cached_file, hash_prefix, progress=progress) + synchronize() + return cached_file diff --git a/maskrcnn_benchmark/utils/registry.py b/maskrcnn_benchmark/utils/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..c3204e14148fe3341307c5d24ba9154c07449511 --- /dev/null +++ b/maskrcnn_benchmark/utils/registry.py @@ -0,0 +1,45 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + + +def _register_generic(module_dict, module_name, module): + assert module_name not in module_dict + module_dict[module_name] = module + + +class Registry(dict): + ''' + A helper class for managing registering modules, it extends a dictionary + and provides a register functions. + + Eg. creeting a registry: + some_registry = Registry({"default": default_module}) + + There're two ways of registering new modules: + 1): normal way is just calling register function: + def foo(): + ... + some_registry.register("foo_module", foo) + 2): used as decorator when declaring the module: + @some_registry.register("foo_module") + @some_registry.register("foo_modeul_nickname") + def foo(): + ... + + Access of module is just like using a dictionary, eg: + f = some_registry["foo_modeul"] + ''' + def __init__(self, *args, **kwargs): + super(Registry, self).__init__(*args, **kwargs) + + def register(self, module_name, module=None): + # used as function call + if module is not None: + _register_generic(self, module_name, module) + return + + # used as decorator + def register_fn(fn): + _register_generic(self, module_name, fn) + return fn + + return register_fn diff --git a/maskrcnn_benchmark/utils/timer.py b/maskrcnn_benchmark/utils/timer.py new file mode 100644 index 0000000000000000000000000000000000000000..935af1a30811abd81de29afd2cfec6cf6880cc5e --- /dev/null +++ b/maskrcnn_benchmark/utils/timer.py @@ -0,0 +1,46 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
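+# Timer accumulates wall-clock time over repeated tic()/toc() calls; toc()
+# returns either the running average or just the last interval. Minimal usage
+# sketch (do_work() stands in for whatever is being timed):
+#
+#     timer = Timer()
+#     for _ in range(5):
+#         timer.tic()
+#         do_work()
+#         timer.toc(average=False)
+#     print(timer.avg_time_str())   # e.g. "0:00:00.123456"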
+ + +import time +import datetime + + +class Timer(object): + def __init__(self): + self.reset() + + @property + def average_time(self): + return self.total_time / self.calls if self.calls > 0 else 0.0 + + def tic(self): + # using time.time instead of time.clock because time time.clock + # does not normalize for multithreading + self.start_time = time.time() + + def toc(self, average=True): + self.add(time.time() - self.start_time) + if average: + return self.average_time + else: + return self.diff + + def add(self, time_diff): + self.diff = time_diff + self.total_time += self.diff + self.calls += 1 + + def reset(self): + self.total_time = 0.0 + self.calls = 0 + self.start_time = 0.0 + self.diff = 0.0 + + def avg_time_str(self): + time_str = str(datetime.timedelta(seconds=self.average_time)) + return time_str + + +def get_time_str(time_diff): + time_str = str(datetime.timedelta(seconds=time_diff)) + return time_str diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..a67b697bd543bc0648f92a63535180d18e870985 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +ninja +yacs +cython +matplotlib +tqdm diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..bfb6845e5f60e510269ae2e283b028f96eab0a37 --- /dev/null +++ b/setup.py @@ -0,0 +1,69 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#!/usr/bin/env python + +import glob +import os + +import torch +from setuptools import find_packages +from setuptools import setup +from torch.utils.cpp_extension import CUDA_HOME +from torch.utils.cpp_extension import CppExtension +from torch.utils.cpp_extension import CUDAExtension + +requirements = ["torch", "torchvision"] + + +def get_extensions(): + this_dir = os.path.dirname(os.path.abspath(__file__)) + extensions_dir = os.path.join(this_dir, "maskrcnn_benchmark", "csrc") + + main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) + source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) + source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) + + sources = main_file + source_cpu + extension = CppExtension + + extra_compile_args = {"cxx": []} + define_macros = [] + + if torch.cuda.is_available() and CUDA_HOME is not None: + extension = CUDAExtension + sources += source_cuda + define_macros += [("WITH_CUDA", None)] + extra_compile_args["nvcc"] = [ + "-DCUDA_HAS_FP16=1", + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_HALF2_OPERATORS__", + ] + + sources = [os.path.join(extensions_dir, s) for s in sources] + + include_dirs = [extensions_dir] + + ext_modules = [ + extension( + "maskrcnn_benchmark._C", + sources, + include_dirs=include_dirs, + define_macros=define_macros, + extra_compile_args=extra_compile_args, + ) + ] + + return ext_modules + + +setup( + name="maskrcnn_benchmark", + version="0.1", + author="fmassa", + url="https://github.com/facebookresearch/maskrcnn-benchmark", + description="object detection in pytorch", + packages=find_packages(exclude=("configs", "tests",)), + # install_requires=requirements, + ext_modules=get_extensions(), + cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, +) diff --git a/tests/checkpoint.py b/tests/checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..82004fb77e57d21cddff929ca0137b416a4af8b5 --- /dev/null +++ b/tests/checkpoint.py @@ -0,0 +1,118 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved. +from collections import OrderedDict +import os +from tempfile import TemporaryDirectory +import unittest + +import torch +from torch import nn + +from maskrcnn_benchmark.utils.model_serialization import load_state_dict +from maskrcnn_benchmark.utils.checkpoint import Checkpointer + + +class TestCheckpointer(unittest.TestCase): + def create_model(self): + return nn.Sequential(nn.Linear(2, 3), nn.Linear(3, 1)) + + def create_complex_model(self): + m = nn.Module() + m.block1 = nn.Module() + m.block1.layer1 = nn.Linear(2, 3) + m.layer2 = nn.Linear(3, 2) + m.res = nn.Module() + m.res.layer2 = nn.Linear(3, 2) + + state_dict = OrderedDict() + state_dict["layer1.weight"] = torch.rand(3, 2) + state_dict["layer1.bias"] = torch.rand(3) + state_dict["layer2.weight"] = torch.rand(2, 3) + state_dict["layer2.bias"] = torch.rand(2) + state_dict["res.layer2.weight"] = torch.rand(2, 3) + state_dict["res.layer2.bias"] = torch.rand(2) + + return m, state_dict + + def test_from_last_checkpoint_model(self): + # test that loading works even if they differ by a prefix + for trained_model, fresh_model in [ + (self.create_model(), self.create_model()), + (nn.DataParallel(self.create_model()), self.create_model()), + (self.create_model(), nn.DataParallel(self.create_model())), + ( + nn.DataParallel(self.create_model()), + nn.DataParallel(self.create_model()), + ), + ]: + + with TemporaryDirectory() as f: + checkpointer = Checkpointer( + trained_model, save_dir=f, save_to_disk=True + ) + checkpointer.save("checkpoint_file") + + # in the same folder + fresh_checkpointer = Checkpointer(fresh_model, save_dir=f) + self.assertTrue(fresh_checkpointer.has_checkpoint()) + self.assertEqual( + fresh_checkpointer.get_checkpoint_file(), + os.path.join(f, "checkpoint_file.pth"), + ) + _ = fresh_checkpointer.load() + + for trained_p, loaded_p in zip( + trained_model.parameters(), fresh_model.parameters() + ): + # different tensor references + self.assertFalse(id(trained_p) == id(loaded_p)) + # same content + self.assertTrue(trained_p.equal(loaded_p)) + + def test_from_name_file_model(self): + # test that loading works even if they differ by a prefix + for trained_model, fresh_model in [ + (self.create_model(), self.create_model()), + (nn.DataParallel(self.create_model()), self.create_model()), + (self.create_model(), nn.DataParallel(self.create_model())), + ( + nn.DataParallel(self.create_model()), + nn.DataParallel(self.create_model()), + ), + ]: + with TemporaryDirectory() as f: + checkpointer = Checkpointer( + trained_model, save_dir=f, save_to_disk=True + ) + checkpointer.save("checkpoint_file") + + # on different folders + with TemporaryDirectory() as g: + fresh_checkpointer = Checkpointer(fresh_model, save_dir=g) + self.assertFalse(fresh_checkpointer.has_checkpoint()) + self.assertEqual(fresh_checkpointer.get_checkpoint_file(), "") + _ = fresh_checkpointer.load(os.path.join(f, "checkpoint_file.pth")) + + for trained_p, loaded_p in zip( + trained_model.parameters(), fresh_model.parameters() + ): + # different tensor references + self.assertFalse(id(trained_p) == id(loaded_p)) + # same content + self.assertTrue(trained_p.equal(loaded_p)) + + def test_complex_model_loaded(self): + for add_data_parallel in [False, True]: + model, state_dict = self.create_complex_model() + if add_data_parallel: + model = nn.DataParallel(model) + + load_state_dict(model, state_dict) + for loaded, stored in zip(model.state_dict().values(), state_dict.values()): + # different tensor references + self.assertFalse(id(loaded) == 
id(stored)) + # same content + self.assertTrue(loaded.equal(stored)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/env_tests/env.py b/tests/env_tests/env.py new file mode 100644 index 0000000000000000000000000000000000000000..26a974abfdd3ebda62e7281216e28d09b2ef8a8f --- /dev/null +++ b/tests/env_tests/env.py @@ -0,0 +1,12 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +import os + + +def get_config_root_path(): + ''' Path to configs for unit tests ''' + # cur_file_dir is root/tests/env_tests + cur_file_dir = os.path.dirname(os.path.abspath(os.path.realpath(__file__))) + ret = os.path.dirname(os.path.dirname(cur_file_dir)) + ret = os.path.join(ret, "configs") + return ret diff --git a/tests/test_backbones.py b/tests/test_backbones.py new file mode 100644 index 0000000000000000000000000000000000000000..2ee91695ce89e35a940dc65a06a64a5aaa05f261 --- /dev/null +++ b/tests/test_backbones.py @@ -0,0 +1,55 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +import unittest +import copy +import torch +# import modules to to register backbones +from maskrcnn_benchmark.modeling.backbone import build_backbone # NoQA +from maskrcnn_benchmark.modeling import registry +from maskrcnn_benchmark.config import cfg as g_cfg +from utils import load_config + + +# overwrite configs if specified, otherwise default config is used +BACKBONE_CFGS = { + "R-50-FPN": "e2e_faster_rcnn_R_50_FPN_1x.yaml", + "R-101-FPN": "e2e_faster_rcnn_R_101_FPN_1x.yaml", + "R-152-FPN": "e2e_faster_rcnn_R_101_FPN_1x.yaml", + "R-50-FPN-RETINANET": "retinanet/retinanet_R-50-FPN_1x.yaml", + "R-101-FPN-RETINANET": "retinanet/retinanet_R-101-FPN_1x.yaml", +} + + +class TestBackbones(unittest.TestCase): + def test_build_backbones(self): + ''' Make sure backbones run ''' + + self.assertGreater(len(registry.BACKBONES), 0) + + for name, backbone_builder in registry.BACKBONES.items(): + print('Testing {}...'.format(name)) + if name in BACKBONE_CFGS: + cfg = load_config(BACKBONE_CFGS[name]) + else: + # Use default config if config file is not specified + cfg = copy.deepcopy(g_cfg) + backbone = backbone_builder(cfg) + + # make sures the backbone has `out_channels` + self.assertIsNotNone( + getattr(backbone, 'out_channels', None), + 'Need to provide out_channels for backbone {}'.format(name) + ) + + N, C_in, H, W = 2, 3, 224, 256 + input = torch.rand([N, C_in, H, W], dtype=torch.float32) + out = backbone(input) + for cur_out in out: + self.assertEqual( + cur_out.shape[:2], + torch.Size([N, backbone.out_channels]) + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_box_coder.py b/tests/test_box_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..ce39923a1a9cd1d3e22203e1b0b9c355a54dd09e --- /dev/null +++ b/tests/test_box_coder.py @@ -0,0 +1,109 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
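+# BoxCoder converts between xyxy boxes and the (dx, dy, dw, dh) regression
+# deltas used by the detection heads. Rough round-trip sketch with unit
+# weights (uses the torch / BoxCoder imports just below):
+#
+#     coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))
+#     ref = torch.tensor([[10.0, 10.0, 50.0, 60.0]])
+#     deltas = coder.encode(ref, ref)     # ~zeros for identical boxes
+#     boxes = coder.decode(deltas, ref)   # ~ref again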
+ +import unittest + +import numpy as np +import torch +from maskrcnn_benchmark.modeling.box_coder import BoxCoder + + +class TestBoxCoder(unittest.TestCase): + def test_box_decoder(self): + """ Match unit test UtilsBoxesTest.TestBboxTransformRandom in + caffe2/operators/generate_proposals_op_util_boxes_test.cc + """ + box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0)) + bbox = torch.from_numpy( + np.array( + [ + 175.62031555, + 20.91103172, + 253.352005, + 155.0145874, + 169.24636841, + 4.85241556, + 228.8605957, + 105.02092743, + 181.77426147, + 199.82876587, + 192.88427734, + 214.0255127, + 174.36262512, + 186.75761414, + 296.19091797, + 231.27906799, + 22.73153877, + 92.02596283, + 135.5695343, + 208.80291748, + ] + ) + .astype(np.float32) + .reshape(-1, 4) + ) + + deltas = torch.from_numpy( + np.array( + [ + 0.47861834, + 0.13992102, + 0.14961673, + 0.71495209, + 0.29915856, + -0.35664671, + 0.89018666, + 0.70815367, + -0.03852064, + 0.44466892, + 0.49492538, + 0.71409376, + 0.28052918, + 0.02184832, + 0.65289006, + 1.05060139, + -0.38172557, + -0.08533806, + -0.60335309, + 0.79052375, + ] + ) + .astype(np.float32) + .reshape(-1, 4) + ) + + gt_bbox = ( + np.array( + [ + 206.949539, + -30.715202, + 297.387665, + 244.448486, + 143.871216, + -83.342888, + 290.502289, + 121.053398, + 177.430283, + 198.666245, + 196.295273, + 228.703079, + 152.251892, + 145.431564, + 387.215454, + 274.594238, + 5.062420, + 11.040955, + 66.328903, + 269.686218, + ] + ) + .astype(np.float32) + .reshape(-1, 4) + ) + + results = box_coder.decode(deltas, bbox) + + np.testing.assert_allclose(results.detach().numpy(), gt_bbox, atol=1e-4) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_configs.py b/tests/test_configs.py new file mode 100644 index 0000000000000000000000000000000000000000..8d6350bef3811fdfbd803edbc780436f9cab735b --- /dev/null +++ b/tests/test_configs.py @@ -0,0 +1,24 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +import unittest +import glob +import os +import utils + + +class TestConfigs(unittest.TestCase): + def test_configs_load(self): + ''' Make sure configs are loadable ''' + + cfg_root_path = utils.get_config_root_path() + files = glob.glob( + os.path.join(cfg_root_path, "./**/*.yaml"), recursive=True) + self.assertGreater(len(files), 0) + + for fn in files: + print('Loading {}...'.format(fn)) + utils.load_config_from_file(fn) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_data_samplers.py b/tests/test_data_samplers.py new file mode 100644 index 0000000000000000000000000000000000000000..96338e1763ccad078da78f0ad49521005d023a97 --- /dev/null +++ b/tests/test_data_samplers.py @@ -0,0 +1,153 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
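+# GroupedBatchSampler yields batches whose elements all share the same group
+# id (typically an aspect-ratio group) while preserving the order of the
+# wrapped sampler; IterationBasedBatchSampler repeats a batch sampler until a
+# fixed number of iterations is reached. Small sketch of the first one:
+#
+#     sampler = SequentialSampler(range(4))
+#     # args: sampler, group_ids, batch_size, drop_uneven
+#     batches = list(GroupedBatchSampler(sampler, [0, 1, 0, 1], 2, False))
+#     # expected: [[0, 2], [1, 3]]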
+import itertools +import random +import unittest + +from torch.utils.data.sampler import BatchSampler +from torch.utils.data.sampler import Sampler +from torch.utils.data.sampler import SequentialSampler +from torch.utils.data.sampler import RandomSampler + +from maskrcnn_benchmark.data.samplers import GroupedBatchSampler +from maskrcnn_benchmark.data.samplers import IterationBasedBatchSampler + + +class SubsetSampler(Sampler): + def __init__(self, indices): + self.indices = indices + + def __iter__(self): + return iter(self.indices) + + def __len__(self): + return len(self.indices) + + +class TestGroupedBatchSampler(unittest.TestCase): + def test_respect_order_simple(self): + drop_uneven = False + dataset = [i for i in range(40)] + group_ids = [i // 10 for i in dataset] + sampler = SequentialSampler(dataset) + for batch_size in [1, 3, 5, 6]: + batch_sampler = GroupedBatchSampler( + sampler, group_ids, batch_size, drop_uneven + ) + result = list(batch_sampler) + merged_result = list(itertools.chain.from_iterable(result)) + self.assertEqual(merged_result, dataset) + + def test_respect_order(self): + drop_uneven = False + dataset = [i for i in range(10)] + group_ids = [0, 0, 1, 0, 1, 1, 0, 1, 1, 0] + sampler = SequentialSampler(dataset) + + expected = [ + [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]], + [[0, 1, 3], [2, 4, 5], [6, 9], [7, 8]], + [[0, 1, 3, 6], [2, 4, 5, 7], [8], [9]], + ] + + for idx, batch_size in enumerate([1, 3, 4]): + batch_sampler = GroupedBatchSampler( + sampler, group_ids, batch_size, drop_uneven + ) + result = list(batch_sampler) + self.assertEqual(result, expected[idx]) + + def test_respect_order_drop_uneven(self): + batch_size = 3 + drop_uneven = True + dataset = [i for i in range(10)] + group_ids = [0, 0, 1, 0, 1, 1, 0, 1, 1, 0] + sampler = SequentialSampler(dataset) + batch_sampler = GroupedBatchSampler(sampler, group_ids, batch_size, drop_uneven) + + result = list(batch_sampler) + + expected = [[0, 1, 3], [2, 4, 5]] + self.assertEqual(result, expected) + + def test_subset_sampler(self): + batch_size = 3 + drop_uneven = False + dataset = [i for i in range(10)] + group_ids = [0, 0, 1, 0, 1, 1, 0, 1, 1, 0] + sampler = SubsetSampler([0, 3, 5, 6, 7, 8]) + + batch_sampler = GroupedBatchSampler(sampler, group_ids, batch_size, drop_uneven) + result = list(batch_sampler) + + expected = [[0, 3, 6], [5, 7, 8]] + self.assertEqual(result, expected) + + def test_permute_subset_sampler(self): + batch_size = 3 + drop_uneven = False + dataset = [i for i in range(10)] + group_ids = [0, 0, 1, 0, 1, 1, 0, 1, 1, 0] + sampler = SubsetSampler([5, 0, 6, 1, 3, 8]) + + batch_sampler = GroupedBatchSampler(sampler, group_ids, batch_size, drop_uneven) + result = list(batch_sampler) + + expected = [[5, 8], [0, 6, 1], [3]] + self.assertEqual(result, expected) + + def test_permute_subset_sampler_drop_uneven(self): + batch_size = 3 + drop_uneven = True + dataset = [i for i in range(10)] + group_ids = [0, 0, 1, 0, 1, 1, 0, 1, 1, 0] + sampler = SubsetSampler([5, 0, 6, 1, 3, 8]) + + batch_sampler = GroupedBatchSampler(sampler, group_ids, batch_size, drop_uneven) + result = list(batch_sampler) + + expected = [[0, 6, 1]] + self.assertEqual(result, expected) + + def test_len(self): + batch_size = 3 + drop_uneven = True + dataset = [i for i in range(10)] + group_ids = [random.randint(0, 1) for _ in dataset] + sampler = RandomSampler(dataset) + + batch_sampler = GroupedBatchSampler(sampler, group_ids, batch_size, drop_uneven) + result = list(batch_sampler) + self.assertEqual(len(result), 
len(batch_sampler)) + self.assertEqual(len(result), len(batch_sampler)) + + batch_sampler = GroupedBatchSampler(sampler, group_ids, batch_size, drop_uneven) + batch_sampler_len = len(batch_sampler) + result = list(batch_sampler) + self.assertEqual(len(result), batch_sampler_len) + self.assertEqual(len(result), len(batch_sampler)) + + +class TestIterationBasedBatchSampler(unittest.TestCase): + def test_number_of_iters_and_elements(self): + for batch_size in [2, 3, 4]: + for num_iterations in [4, 10, 20]: + for drop_last in [False, True]: + dataset = [i for i in range(10)] + sampler = SequentialSampler(dataset) + batch_sampler = BatchSampler( + sampler, batch_size, drop_last=drop_last + ) + + iter_sampler = IterationBasedBatchSampler( + batch_sampler, num_iterations + ) + assert len(iter_sampler) == num_iterations + for i, batch in enumerate(iter_sampler): + start = (i % len(batch_sampler)) * batch_size + end = min(start + batch_size, len(dataset)) + expected = [x for x in range(start, end)] + self.assertEqual(batch, expected) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_detectors.py b/tests/test_detectors.py new file mode 100644 index 0000000000000000000000000000000000000000..5f9f7bfa27a5aa1e063e98b47c158d74a2b06ba1 --- /dev/null +++ b/tests/test_detectors.py @@ -0,0 +1,143 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +import unittest +import glob +import os +import copy +import torch +from maskrcnn_benchmark.modeling.detector import build_detection_model +from maskrcnn_benchmark.structures.image_list import to_image_list +import utils + + +CONFIG_FILES = [ + # bbox + "e2e_faster_rcnn_R_50_C4_1x.yaml", + "e2e_faster_rcnn_R_50_FPN_1x.yaml", + "e2e_faster_rcnn_fbnet.yaml", + + # mask + "e2e_mask_rcnn_R_50_C4_1x.yaml", + "e2e_mask_rcnn_R_50_FPN_1x.yaml", + "e2e_mask_rcnn_fbnet.yaml", + + # keypoints + # TODO: fail to run for random model due to empty head input + # "e2e_keypoint_rcnn_R_50_FPN_1x.yaml", + + # gn + "gn_baselines/e2e_faster_rcnn_R_50_FPN_1x_gn.yaml", + # TODO: fail to run for random model due to empty head input + # "gn_baselines/e2e_mask_rcnn_R_50_FPN_Xconv1fc_1x_gn.yaml", + + # retinanet + "retinanet/retinanet_R-50-FPN_1x.yaml", + + # rpn only + "rpn_R_50_C4_1x.yaml", + "rpn_R_50_FPN_1x.yaml", +] + +EXCLUDED_FOLDERS = [ + "caffe2", + "quick_schedules", + "pascal_voc", + "cityscapes", +] + + +TEST_CUDA = torch.cuda.is_available() + + +def get_config_files(file_list, exclude_folders): + cfg_root_path = utils.get_config_root_path() + if file_list is not None: + files = [os.path.join(cfg_root_path, x) for x in file_list] + else: + files = glob.glob( + os.path.join(cfg_root_path, "./**/*.yaml"), recursive=True) + + def _contains(path, exclude_dirs): + return any(x in path for x in exclude_dirs) + + if exclude_folders is not None: + files = [x for x in files if not _contains(x, exclude_folders)] + + return files + + +def create_model(cfg, device): + cfg = copy.deepcopy(cfg) + cfg.freeze() + model = build_detection_model(cfg) + model = model.to(device) + return model + + +def create_random_input(cfg, device): + ret = [] + for x in cfg.INPUT.MIN_SIZE_TRAIN: + ret.append(torch.rand(3, x, int(x * 1.2))) + ret = to_image_list(ret, cfg.DATALOADER.SIZE_DIVISIBILITY) + ret = ret.to(device) + return ret + + +def _test_build_detectors(self, device): + ''' Make sure models build ''' + + cfg_files = get_config_files(None, EXCLUDED_FOLDERS) + self.assertGreater(len(cfg_files), 0) + + for cfg_file in cfg_files: + with 
self.subTest(cfg_file=cfg_file): + print('Testing {}...'.format(cfg_file)) + cfg = utils.load_config_from_file(cfg_file) + create_model(cfg, device) + + +def _test_run_selected_detectors(self, cfg_files, device): + ''' Make sure models build and run ''' + self.assertGreater(len(cfg_files), 0) + + for cfg_file in cfg_files: + with self.subTest(cfg_file=cfg_file): + print('Testing {}...'.format(cfg_file)) + cfg = utils.load_config_from_file(cfg_file) + cfg.MODEL.RPN.POST_NMS_TOP_N_TEST = 10 + cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST = 10 + model = create_model(cfg, device) + inputs = create_random_input(cfg, device) + model.eval() + output = model(inputs) + self.assertEqual(len(output), len(inputs.image_sizes)) + + +class TestDetectors(unittest.TestCase): + def test_build_detectors(self): + ''' Make sure models build ''' + _test_build_detectors(self, "cpu") + + @unittest.skipIf(not TEST_CUDA, "no CUDA detected") + def test_build_detectors_cuda(self): + ''' Make sure models build on gpu''' + _test_build_detectors(self, "cuda") + + def test_run_selected_detectors(self): + ''' Make sure models build and run ''' + # run on selected models + cfg_files = get_config_files(CONFIG_FILES, None) + # cfg_files = get_config_files(None, EXCLUDED_FOLDERS) + _test_run_selected_detectors(self, cfg_files, "cpu") + + @unittest.skipIf(not TEST_CUDA, "no CUDA detected") + def test_run_selected_detectors_cuda(self): + ''' Make sure models build and run on cuda ''' + # run on selected models + cfg_files = get_config_files(CONFIG_FILES, None) + # cfg_files = get_config_files(None, EXCLUDED_FOLDERS) + _test_run_selected_detectors(self, cfg_files, "cuda") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_fbnet.py b/tests/test_fbnet.py new file mode 100644 index 0000000000000000000000000000000000000000..55eaabccc6ca0920060b547924b8c5b5063bb8f5 --- /dev/null +++ b/tests/test_fbnet.py @@ -0,0 +1,84 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
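+# Every entry in fbnet_builder.PRIMITIVES is a factory taking positional
+# arguments (C_in, C_out, expand, stride) and returning an nn.Module. Sketch
+# of what each test below checks on a random NCHW tensor:
+#
+#     op_name, op_func = next(iter(fbnet_builder.PRIMITIVES.items()))
+#     op = op_func(16, 32, 4, 1)     # C_in=16, C_out=32, expand=4, stride=1
+#     out = op(torch.rand(2, 16, 7, 7))
+#     assert out.shape[:2] == torch.Size([2, 32])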
+ +import unittest + +import numpy as np +import torch +import maskrcnn_benchmark.modeling.backbone.fbnet_builder as fbnet_builder + + +TEST_CUDA = torch.cuda.is_available() + + +def _test_primitive(self, device, op_name, op_func, N, C_in, C_out, expand, stride): + op = op_func(C_in, C_out, expand, stride).to(device) + input = torch.rand([N, C_in, 7, 7], dtype=torch.float32).to(device) + output = op(input) + self.assertEqual( + output.shape[:2], torch.Size([N, C_out]), + 'Primitive {} failed for shape {}.'.format(op_name, input.shape) + ) + + +class TestFBNetBuilder(unittest.TestCase): + def test_identity(self): + id_op = fbnet_builder.Identity(20, 20, 1) + input = torch.rand([10, 20, 7, 7], dtype=torch.float32) + output = id_op(input) + np.testing.assert_array_equal(np.array(input), np.array(output)) + + id_op = fbnet_builder.Identity(20, 40, 2) + input = torch.rand([10, 20, 7, 7], dtype=torch.float32) + output = id_op(input) + np.testing.assert_array_equal(output.shape, [10, 40, 4, 4]) + + def test_primitives(self): + ''' Make sures the primitives runs ''' + for op_name, op_func in fbnet_builder.PRIMITIVES.items(): + print('Testing {}'.format(op_name)) + + _test_primitive( + self, "cpu", + op_name, op_func, + N=20, C_in=16, C_out=32, expand=4, stride=1 + ) + + @unittest.skipIf(not TEST_CUDA, "no CUDA detected") + def test_primitives_cuda(self): + ''' Make sures the primitives runs on cuda ''' + for op_name, op_func in fbnet_builder.PRIMITIVES.items(): + print('Testing {}'.format(op_name)) + + _test_primitive( + self, "cuda", + op_name, op_func, + N=20, C_in=16, C_out=32, expand=4, stride=1 + ) + + def test_primitives_empty_batch(self): + ''' Make sures the primitives runs ''' + for op_name, op_func in fbnet_builder.PRIMITIVES.items(): + print('Testing {}'.format(op_name)) + + # test empty batch size + _test_primitive( + self, "cpu", + op_name, op_func, + N=0, C_in=16, C_out=32, expand=4, stride=1 + ) + + @unittest.skipIf(not TEST_CUDA, "no CUDA detected") + def test_primitives_cuda_empty_batch(self): + ''' Make sures the primitives runs ''' + for op_name, op_func in fbnet_builder.PRIMITIVES.items(): + print('Testing {}'.format(op_name)) + + # test empty batch size + _test_primitive( + self, "cuda", + op_name, op_func, + N=0, C_in=16, C_out=32, expand=4, stride=1 + ) + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_feature_extractors.py b/tests/test_feature_extractors.py new file mode 100644 index 0000000000000000000000000000000000000000..e14388bfb0af9c6c71c4d538cabd01497d7d5209 --- /dev/null +++ b/tests/test_feature_extractors.py @@ -0,0 +1,93 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
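+# ROI feature extractors are registered builders with the signature
+# builder(cfg, in_channels); the resulting module consumes a list of feature
+# maps plus one BoxList of proposals per image and returns pooled per-box
+# features with `out_channels` channels. _test_feature_extractors below runs
+# every registered extractor on random inputs to verify exactly that contract.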
+ +import unittest +import copy +import torch +# import modules to to register feature extractors +from maskrcnn_benchmark.modeling.backbone import build_backbone # NoQA +from maskrcnn_benchmark.modeling.roi_heads.roi_heads import build_roi_heads # NoQA +from maskrcnn_benchmark.modeling import registry +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.config import cfg as g_cfg +from utils import load_config + +# overwrite configs if specified, otherwise default config is used +FEATURE_EXTRACTORS_CFGS = { +} + +# overwrite configs if specified, otherwise default config is used +FEATURE_EXTRACTORS_INPUT_CHANNELS = { + # in_channels was not used, load through config + "ResNet50Conv5ROIFeatureExtractor": 1024, +} + + +def _test_feature_extractors( + self, extractors, overwrite_cfgs, overwrite_in_channels +): + ''' Make sure roi box feature extractors run ''' + + self.assertGreater(len(extractors), 0) + + in_channels_default = 64 + + for name, builder in extractors.items(): + print('Testing {}...'.format(name)) + if name in overwrite_cfgs: + cfg = load_config(overwrite_cfgs[name]) + else: + # Use default config if config file is not specified + cfg = copy.deepcopy(g_cfg) + + in_channels = overwrite_in_channels.get( + name, in_channels_default) + + fe = builder(cfg, in_channels) + self.assertIsNotNone( + getattr(fe, 'out_channels', None), + 'Need to provide out_channels for feature extractor {}'.format(name) + ) + + N, C_in, H, W = 2, in_channels, 24, 32 + input = torch.rand([N, C_in, H, W], dtype=torch.float32) + bboxes = [[1, 1, 10, 10], [5, 5, 8, 8], [2, 2, 3, 4]] + img_size = [384, 512] + box_list = BoxList(bboxes, img_size, "xyxy") + out = fe([input], [box_list] * N) + self.assertEqual( + out.shape[:2], + torch.Size([N * len(bboxes), fe.out_channels]) + ) + + +class TestFeatureExtractors(unittest.TestCase): + def test_roi_box_feature_extractors(self): + ''' Make sure roi box feature extractors run ''' + _test_feature_extractors( + self, + registry.ROI_BOX_FEATURE_EXTRACTORS, + FEATURE_EXTRACTORS_CFGS, + FEATURE_EXTRACTORS_INPUT_CHANNELS, + ) + + def test_roi_keypoints_feature_extractors(self): + ''' Make sure roi keypoints feature extractors run ''' + _test_feature_extractors( + self, + registry.ROI_KEYPOINT_FEATURE_EXTRACTORS, + FEATURE_EXTRACTORS_CFGS, + FEATURE_EXTRACTORS_INPUT_CHANNELS, + ) + + def test_roi_mask_feature_extractors(self): + ''' Make sure roi mask feature extractors run ''' + _test_feature_extractors( + self, + registry.ROI_MASK_FEATURE_EXTRACTORS, + FEATURE_EXTRACTORS_CFGS, + FEATURE_EXTRACTORS_INPUT_CHANNELS, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_metric_logger.py b/tests/test_metric_logger.py new file mode 100644 index 0000000000000000000000000000000000000000..a5b884c908103f4cbdf364b94f4894083cff420b --- /dev/null +++ b/tests/test_metric_logger.py @@ -0,0 +1,28 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
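+# MetricLogger keeps one SmoothedValue per metric name and formats the
+# windowed median next to the global average. Usage sketch (string shown
+# assumes dict insertion order, i.e. Python 3.7+):
+#
+#     meter = MetricLogger(delimiter="  ")
+#     meter.update(loss=0.5, lr=0.01)
+#     str(meter)   # -> "loss: 0.5000 (0.5000)  lr: 0.0100 (0.0100)"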
+import unittest + +from maskrcnn_benchmark.utils.metric_logger import MetricLogger + + +class TestMetricLogger(unittest.TestCase): + def test_update(self): + meter = MetricLogger() + for i in range(10): + meter.update(metric=float(i)) + + m = meter.meters["metric"] + self.assertEqual(m.count, 10) + self.assertEqual(m.total, 45) + self.assertEqual(m.median, 4) + self.assertEqual(m.avg, 4.5) + + def test_no_attr(self): + meter = MetricLogger() + _ = meter.meters + _ = meter.delimiter + def broken(): + _ = meter.not_existent + self.assertRaises(AttributeError, broken) + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_nms.py b/tests/test_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..1ed7dfd2f258b0ec759480d6b6fb54bad70cfd22 --- /dev/null +++ b/tests/test_nms.py @@ -0,0 +1,221 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +import unittest + +import numpy as np +import torch +from maskrcnn_benchmark.layers import nms as box_nms + + +class TestNMS(unittest.TestCase): + def test_nms_cpu(self): + """ Match unit test UtilsNMSTest.TestNMS in + caffe2/operators/generate_proposals_op_util_nms_test.cc + """ + + inputs = ( + np.array( + [ + 10, + 10, + 50, + 60, + 0.5, + 11, + 12, + 48, + 60, + 0.7, + 8, + 9, + 40, + 50, + 0.6, + 100, + 100, + 150, + 140, + 0.9, + 99, + 110, + 155, + 139, + 0.8, + ] + ) + .astype(np.float32) + .reshape(-1, 5) + ) + + boxes = torch.from_numpy(inputs[:, :4]) + scores = torch.from_numpy(inputs[:, 4]) + test_thresh = [0.1, 0.3, 0.5, 0.8, 0.9] + gt_indices = [[1, 3], [1, 3], [1, 3], [1, 2, 3, 4], [0, 1, 2, 3, 4]] + + for thresh, gt_index in zip(test_thresh, gt_indices): + keep_indices = box_nms(boxes, scores, thresh) + keep_indices = np.sort(keep_indices) + np.testing.assert_array_equal(keep_indices, np.array(gt_index)) + + def test_nms1_cpu(self): + """ Match unit test UtilsNMSTest.TestNMS1 in + caffe2/operators/generate_proposals_op_util_nms_test.cc + """ + + boxes = torch.from_numpy( + np.array( + [ + [350.9821, 161.8200, 369.9685, 205.2372], + [250.5236, 154.2844, 274.1773, 204.9810], + [471.4920, 160.4118, 496.0094, 213.4244], + [352.0421, 164.5933, 366.4458, 205.9624], + [166.0765, 169.7707, 183.0102, 232.6606], + [252.3000, 183.1449, 269.6541, 210.6747], + [469.7862, 162.0192, 482.1673, 187.0053], + [168.4862, 174.2567, 181.7437, 232.9379], + [470.3290, 162.3442, 496.4272, 214.6296], + [251.0450, 155.5911, 272.2693, 203.3675], + [252.0326, 154.7950, 273.7404, 195.3671], + [351.7479, 161.9567, 370.6432, 204.3047], + [496.3306, 161.7157, 515.0573, 210.7200], + [471.0749, 162.6143, 485.3374, 207.3448], + [250.9745, 160.7633, 264.1924, 206.8350], + [470.4792, 169.0351, 487.1934, 220.2984], + [474.4227, 161.9546, 513.1018, 215.5193], + [251.9428, 184.1950, 262.6937, 207.6416], + [252.6623, 175.0252, 269.8806, 213.7584], + [260.9884, 157.0351, 288.3554, 206.6027], + [251.3629, 164.5101, 263.2179, 202.4203], + [471.8361, 190.8142, 485.6812, 220.8586], + [248.6243, 156.9628, 264.3355, 199.2767], + [495.1643, 158.0483, 512.6261, 184.4192], + [376.8718, 168.0144, 387.3584, 201.3210], + [122.9191, 160.7433, 172.5612, 231.3837], + [350.3857, 175.8806, 366.2500, 205.4329], + [115.2958, 162.7822, 161.9776, 229.6147], + [168.4375, 177.4041, 180.8028, 232.4551], + [169.7939, 184.4330, 181.4767, 232.1220], + [347.7536, 175.9356, 355.8637, 197.5586], + [495.5434, 164.6059, 516.4031, 207.7053], + [172.1216, 194.6033, 183.1217, 235.2653], + [264.2654, 181.5540, 288.4626, 214.0170], + [111.7971, 183.7748, 
137.3745, 225.9724], + [253.4919, 186.3945, 280.8694, 210.0731], + [165.5334, 169.7344, 185.9159, 232.8514], + [348.3662, 184.5187, 354.9081, 201.4038], + [164.6562, 162.5724, 186.3108, 233.5010], + [113.2999, 186.8410, 135.8841, 219.7642], + [117.0282, 179.8009, 142.5375, 221.0736], + [462.1312, 161.1004, 495.3576, 217.2208], + [462.5800, 159.9310, 501.2937, 224.1655], + [503.5242, 170.0733, 518.3792, 209.0113], + [250.3658, 195.5925, 260.6523, 212.4679], + [108.8287, 163.6994, 146.3642, 229.7261], + [256.7617, 187.3123, 288.8407, 211.2013], + [161.2781, 167.4801, 186.3751, 232.7133], + [115.3760, 177.5859, 163.3512, 236.9660], + [248.9077, 188.0919, 264.8579, 207.9718], + [108.1349, 160.7851, 143.6370, 229.6243], + [465.0900, 156.7555, 490.3561, 213.5704], + [107.5338, 173.4323, 141.0704, 235.2910], + ] + ).astype(np.float32) + ) + scores = torch.from_numpy( + np.array( + [ + 0.1919, + 0.3293, + 0.0860, + 0.1600, + 0.1885, + 0.4297, + 0.0974, + 0.2711, + 0.1483, + 0.1173, + 0.1034, + 0.2915, + 0.1993, + 0.0677, + 0.3217, + 0.0966, + 0.0526, + 0.5675, + 0.3130, + 0.1592, + 0.1353, + 0.0634, + 0.1557, + 0.1512, + 0.0699, + 0.0545, + 0.2692, + 0.1143, + 0.0572, + 0.1990, + 0.0558, + 0.1500, + 0.2214, + 0.1878, + 0.2501, + 0.1343, + 0.0809, + 0.1266, + 0.0743, + 0.0896, + 0.0781, + 0.0983, + 0.0557, + 0.0623, + 0.5808, + 0.3090, + 0.1050, + 0.0524, + 0.0513, + 0.4501, + 0.4167, + 0.0623, + 0.1749, + ] + ).astype(np.float32) + ) + + gt_indices = np.array( + [ + 1, + 6, + 7, + 8, + 11, + 12, + 13, + 14, + 17, + 18, + 19, + 21, + 23, + 24, + 25, + 26, + 30, + 32, + 33, + 34, + 35, + 37, + 43, + 44, + 47, + 50, + ] + ) + keep_indices = box_nms(boxes, scores, 0.5) + keep_indices = np.sort(keep_indices) + + np.testing.assert_array_equal(keep_indices, gt_indices) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_predictors.py b/tests/test_predictors.py new file mode 100644 index 0000000000000000000000000000000000000000..b22c2524630ce937141c56238808450ac7efa307 --- /dev/null +++ b/tests/test_predictors.py @@ -0,0 +1,98 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
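+# ROI predictors are the final heads on top of the pooled features: box
+# predictors return a (class_logits, bbox_deltas) pair, while keypoint and
+# mask predictors return a single per-class prediction map. _test_predictors
+# below builds each registered predictor via builder(cfg, in_channels) and
+# feeds it a random N x C x hwsize x hwsize tensor so the tests can check
+# those output shapes.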
+ +import unittest +import copy +import torch +# import modules to to register predictors +from maskrcnn_benchmark.modeling.backbone import build_backbone # NoQA +from maskrcnn_benchmark.modeling.roi_heads.roi_heads import build_roi_heads # NoQA +from maskrcnn_benchmark.modeling import registry +from maskrcnn_benchmark.config import cfg as g_cfg +from utils import load_config + + +# overwrite configs if specified, otherwise default config is used +PREDICTOR_CFGS = { +} + +# overwrite configs if specified, otherwise default config is used +PREDICTOR_INPUT_CHANNELS = { +} + + +def _test_predictors( + self, predictors, overwrite_cfgs, overwrite_in_channels, + hwsize, +): + ''' Make sure predictors run ''' + + self.assertGreater(len(predictors), 0) + + in_channels_default = 64 + + for name, builder in predictors.items(): + print('Testing {}...'.format(name)) + if name in overwrite_cfgs: + cfg = load_config(overwrite_cfgs[name]) + else: + # Use default config if config file is not specified + cfg = copy.deepcopy(g_cfg) + + in_channels = overwrite_in_channels.get( + name, in_channels_default) + + fe = builder(cfg, in_channels) + + N, C_in, H, W = 2, in_channels, hwsize, hwsize + input = torch.rand([N, C_in, H, W], dtype=torch.float32) + out = fe(input) + yield input, out, cfg + + +class TestPredictors(unittest.TestCase): + def test_roi_box_predictors(self): + ''' Make sure roi box predictors run ''' + for cur_in, cur_out, cur_cfg in _test_predictors( + self, + registry.ROI_BOX_PREDICTOR, + PREDICTOR_CFGS, + PREDICTOR_INPUT_CHANNELS, + hwsize=1, + ): + self.assertEqual(len(cur_out), 2) + scores, bbox_deltas = cur_out[0], cur_out[1] + self.assertEqual( + scores.shape[1], cur_cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES) + self.assertEqual(scores.shape[0], cur_in.shape[0]) + self.assertEqual(scores.shape[0], bbox_deltas.shape[0]) + self.assertEqual(scores.shape[1] * 4, bbox_deltas.shape[1]) + + def test_roi_keypoints_predictors(self): + ''' Make sure roi keypoint predictors run ''' + for cur_in, cur_out, cur_cfg in _test_predictors( + self, + registry.ROI_KEYPOINT_PREDICTOR, + PREDICTOR_CFGS, + PREDICTOR_INPUT_CHANNELS, + hwsize=14, + ): + self.assertEqual(cur_out.shape[0], cur_in.shape[0]) + self.assertEqual( + cur_out.shape[1], cur_cfg.MODEL.ROI_KEYPOINT_HEAD.NUM_CLASSES) + + def test_roi_mask_predictors(self): + ''' Make sure roi mask predictors run ''' + for cur_in, cur_out, cur_cfg in _test_predictors( + self, + registry.ROI_MASK_PREDICTOR, + PREDICTOR_CFGS, + PREDICTOR_INPUT_CHANNELS, + hwsize=14, + ): + self.assertEqual(cur_out.shape[0], cur_in.shape[0]) + self.assertEqual( + cur_out.shape[1], cur_cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_rpn_heads.py b/tests/test_rpn_heads.py new file mode 100644 index 0000000000000000000000000000000000000000..6cf3b34f630f81904e9e0d5fa0a4a1984f3e9471 --- /dev/null +++ b/tests/test_rpn_heads.py @@ -0,0 +1,62 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
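+# RPN heads are built as builder(cfg, in_channels, num_anchors) and, given a
+# list of per-level feature maps, return two lists of equal length: objectness
+# logits of shape (N, num_anchors, H, W) and box regression of shape
+# (N, num_anchors * 4, H, W) for each level. The test below verifies this on
+# random input for every head registered in registry.RPN_HEADS.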
+ +import unittest +import copy +import torch +# import modules to to register rpn heads +from maskrcnn_benchmark.modeling.backbone import build_backbone # NoQA +from maskrcnn_benchmark.modeling.rpn.rpn import build_rpn # NoQA +from maskrcnn_benchmark.modeling import registry +from maskrcnn_benchmark.config import cfg as g_cfg +from utils import load_config + + +# overwrite configs if specified, otherwise default config is used +RPN_CFGS = { +} + + +class TestRPNHeads(unittest.TestCase): + def test_build_rpn_heads(self): + ''' Make sure rpn heads run ''' + + self.assertGreater(len(registry.RPN_HEADS), 0) + + in_channels = 64 + num_anchors = 10 + + for name, builder in registry.RPN_HEADS.items(): + print('Testing {}...'.format(name)) + if name in RPN_CFGS: + cfg = load_config(RPN_CFGS[name]) + else: + # Use default config if config file is not specified + cfg = copy.deepcopy(g_cfg) + + rpn = builder(cfg, in_channels, num_anchors) + + N, C_in, H, W = 2, in_channels, 24, 32 + input = torch.rand([N, C_in, H, W], dtype=torch.float32) + LAYERS = 3 + out = rpn([input] * LAYERS) + self.assertEqual(len(out), 2) + logits, bbox_reg = out + for idx in range(LAYERS): + self.assertEqual( + logits[idx].shape, + torch.Size([ + input.shape[0], num_anchors, + input.shape[2], input.shape[3], + ]) + ) + self.assertEqual( + bbox_reg[idx].shape, + torch.Size([ + logits[idx].shape[0], num_anchors * 4, + logits[idx].shape[2], logits[idx].shape[3], + ]), + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..454923e8aa05d59caaf4931715b711cdcb0b2691 --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,28 @@ +from __future__ import absolute_import, division, print_function, unicode_literals + +# Set up custom environment before nearly anything else is imported +# NOTE: this should be the first import (no not reorder) +from maskrcnn_benchmark.utils.env import setup_environment # noqa F401 isort:skip +import env_tests.env as env_tests + +import os +import copy + +from maskrcnn_benchmark.config import cfg as g_cfg + + +def get_config_root_path(): + return env_tests.get_config_root_path() + + +def load_config(rel_path): + ''' Load config from file path specified as path relative to config_root ''' + cfg_path = os.path.join(env_tests.get_config_root_path(), rel_path) + return load_config_from_file(cfg_path) + + +def load_config_from_file(file_path): + ''' Load config from file path specified as absolute path ''' + ret = copy.deepcopy(g_cfg) + ret.merge_from_file(file_path) + return ret diff --git a/toolboxes/compute_threshold.py b/toolboxes/compute_threshold.py new file mode 100755 index 0000000000000000000000000000000000000000..de3274da7a52809e7b516113addfe708e77e0abb --- /dev/null +++ b/toolboxes/compute_threshold.py @@ -0,0 +1,52 @@ +import glob +import json +import os +from collections import defaultdict + +import numpy as np +from PIL import Image +from tqdm import tqdm + + +def compute(): + with open('instances_train2019.json') as fid: + data = json.load(fid) + images = {} + for x in data['images']: + images[x['id']] = x + + annotations = {} + for x in data['annotations']: + annotations[images[x['image_id']]['file_name']] = x + + object_paths = glob.glob(os.path.join('/data7/lufficc/process_rpc/cropped_train2019/', '*.jpg')) + + object_category_paths = defaultdict(list) + for path in object_paths: + name = os.path.basename(path) + category = annotations[name]['category_id'] + 
object_category_paths[category].append(path) + + object_category_paths = dict(object_category_paths) + + ratio_anns = {} + for category, paths in tqdm(object_category_paths.items()): + areas = [] + for object_path in paths: + name = os.path.basename(object_path) + mask_path = os.path.join('/data7/lufficc/rpc/object_masks/', '{}.png'.format(name.split('.')[0])) + mask = Image.open(mask_path).convert('L') + area = np.array(mask, dtype=np.bool).sum() + areas.append(area) + areas = np.array(areas) + max_area = areas.max() + ratios = np.round(areas / max_area, 3) + for i, object_path in enumerate(paths): + name = os.path.basename(object_path) + ratio_anns[name] = ratios[i] + with open('ratio_annotations.json', 'w') as fid: + json.dump(ratio_anns, fid) + + +if __name__ == '__main__': + compute() diff --git a/toolboxes/extract_mask.py b/toolboxes/extract_mask.py new file mode 100755 index 0000000000000000000000000000000000000000..c034b7df1016f266474ebd62d41cf611004335c5 --- /dev/null +++ b/toolboxes/extract_mask.py @@ -0,0 +1,96 @@ +import glob +import json +import os +import sys +from argparse import ArgumentParser + +import cv2 +import numpy as np +from scipy import ndimage +from tqdm import tqdm + + +def do_extract(path): + annotation = annotations[os.path.basename(path)] + bbox = annotation['bbox'] + x, y, w, h = [int(x) for x in bbox] + img = cv2.imread(path) + origin_height, origin_width = img.shape[:2] + + box_pad = 5 + crop_x1 = x - box_pad + crop_y1 = y - box_pad + crop_x2 = x + w + box_pad + crop_y2 = y + h + box_pad + + x = x - crop_x1 + y = y - crop_y1 + + origin_img = img = img[crop_y1:crop_y2, crop_x1:crop_x2] + + img = cv2.bilateralFilter(img, 3, 75, 75) + + # ------------------------- + # edge detect + # ------------------------- + edges = detector.detectEdges(np.float32(img) / 255) + + # ------------------------- + # edge process + # ------------------------- + object_box_mask = np.zeros_like(edges, dtype=np.uint8) + object_box_mask[y:y + h, x:x + w] = 1 + edges[(1 - object_box_mask) == 1] = 0 + edges[(edges < (edges.mean() * 0.5)) & (edges < 0.1)] = 0 + + # ------------------------- + # erode and dilate + # ------------------------- + filled = ndimage.binary_fill_holes(edges).astype(np.uint8) + filled = cv2.erode(filled, np.ones((32, 32), np.uint8)) + filled = cv2.dilate(filled, np.ones((32, 32), np.uint8)) + filled = cv2.erode(filled, np.ones((8, 8), np.uint8)) + + filled = cv2.medianBlur(filled, 17) + save_image = np.zeros((origin_height, origin_width), np.uint8) + save_image[crop_y1:crop_y2, crop_x1:crop_x2] = np.array(filled * 255, dtype=np.uint8) + cv2.imwrite(os.path.join(output_dir, os.path.basename(path).split('.')[0] + '.png'), save_image) + + masked_img = origin_img * filled[:, :, None] + compare_img = np.concatenate([origin_img, masked_img], axis=1) + cv2.imwrite(os.path.join(compare_dir, os.path.basename(path)), compare_img) + + +def extract(paths): + for path in tqdm(paths): + do_extract(path) + + +if __name__ == '__main__': + parser = ArgumentParser(description="Extract masks") + parser.add_argument('--ann_file', type=str, default='instances_train2019.json') + parser.add_argument('--images_dir', type=str, default='train2019') + parser.add_argument('--model_file', type=str, default='model.yml.gz') + args = parser.parse_args() + + with open(args.ann_file) as fid: + data = json.load(fid) + images = {} + for x in data['images']: + images[x['id']] = x + annotations = {} + for x in data['annotations']: + annotations[images[x['image_id']]['file_name']] = x + + output_dir 
= 'extracted_masks/masks' + compare_dir = 'extracted_masks/masked_images' + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + if not os.path.exists(compare_dir): + os.makedirs(compare_dir) + + categories = [i + 1 for i in range(200)] + paths = glob.glob(os.path.join(args.images_dir, '*.jpg')) + detector = cv2.ximgproc.createStructuredEdgeDetection(args.model_file) + extract(paths) diff --git a/toolboxes/lunch.py b/toolboxes/lunch.py new file mode 100644 index 0000000000000000000000000000000000000000..3fc6fa1c59266587475850ea9cfe518a282e282d --- /dev/null +++ b/toolboxes/lunch.py @@ -0,0 +1,35 @@ +import os +import subprocess +import sys +from argparse import ArgumentParser, REMAINDER + +if __name__ == '__main__': + parser = ArgumentParser(description="PyTorch distributed training launch " + "helper utility that will spawn up " + "multiple distributed processes") + parser.add_argument('training_script_args', nargs=REMAINDER) + parser.add_argument('--script', + metavar="FILE", + help="path to script file", + type=str) + parser.add_argument('--count', type=int, default=4) + + current_env = os.environ.copy() + args = parser.parse_args() + + processes = [] + script = args.script + for local_rank in range(0, args.count): + cmd = [sys.executable, + "-u", + script, + "--local_rank={}".format(local_rank), + "--count={}".format(args.count)] + args.training_script_args + process = subprocess.Popen(cmd, env=current_env) + processes.append(process) + + for process in processes: + process.wait() + if process.returncode != 0: + raise subprocess.CalledProcessError(returncode=process.returncode, + cmd=process.args) diff --git a/toolboxes/model.yml.gz b/toolboxes/model.yml.gz new file mode 100755 index 0000000000000000000000000000000000000000..1a0138a40935d7ddc82cfac24b013265943471a9 Binary files /dev/null and b/toolboxes/model.yml.gz differ diff --git a/toolboxes/synthesize_images.py b/toolboxes/synthesize_images.py new file mode 100755 index 0000000000000000000000000000000000000000..d0df80627abb1d9f6fd70ccc89021947e8c33c1d --- /dev/null +++ b/toolboxes/synthesize_images.py @@ -0,0 +1,308 @@ +import glob +import json +import os +import random +import scipy.spatial +import time +from argparse import ArgumentParser +from collections import defaultdict +import cv2 +import numpy as np +from PIL import Image +from scipy import ndimage +from tqdm import tqdm + +NUM_CATEGORIES = 200 +GENERATED_NUM = 100000 + + +def buy_strategic(counter): + categories = [i + 1 for i in range(NUM_CATEGORIES)] + selected_categories = np.random.choice(categories, size=random.randint(3, 10), replace=False) + num_categories = len(selected_categories) + + if 3 <= num_categories < 5: # Easy mode: 3∼5 + num_instances = random.randint(num_categories, 10) + counter['easy_mode'] += 1 + elif 5 <= num_categories < 8: # Medium mode: 5∼8 + num_instances = random.randint(10, 15) + counter['medium_mode'] += 1 + else: # Hard mode: 8∼10 + num_instances = random.randint(15, 20) + counter['hard_mode'] += 1 + + num_per_category = {} + generated = 0 + for i, category in enumerate(selected_categories): + i += 1 + if i == num_categories: + count = num_instances - generated + else: + count = random.randint(1, num_instances - (num_categories - i) - generated) + generated += count + num_per_category[int(category)] = count + + return num_per_category + + +def check_iou(annotations, box, threshold=0.5): + """ + Args: + annotations: objects already placed on the background; each is a dict with a 'bbox' entry (x, y, w, h) + box: candidate box (x, y, w, h) + threshold: maximum IoU tolerated against any existing box + Returns: bool, True if the candidate box can be placed + """ + + cx1, cy1, cw, ch = box + cx2, cy2 = cx1 + cw, cy1 + ch + carea =
cw * ch + for ann in annotations: + x1, y1, w, h = ann['bbox'] + x2, y2 = x1 + w, y1 + h + area = w * h + inter_x1 = max(x1, cx1) + inter_y1 = max(y1, cy1) + inter_x2 = min(x2, cx2) + inter_y2 = min(y2, cy2) + + inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1) + iou = inter_area / (carea + area - inter_area + 1e-8) # avoid division by zero + if iou > threshold: + return False + return True + + +def sample_select_object_index(category, paths, ratio_annotations, threshold=0.45): + high_threshold_paths = [path for path in paths if ratio_annotations[os.path.basename(path)] > threshold] + index = random.randint(0, len(high_threshold_paths) - 1) + path = high_threshold_paths[index] + return path + + +def generated_position(width, height, w, h, pad=0): + x = random.randint(pad, width - w - pad) + y = random.randint(pad, height - h - pad) + return x, y + + +def get_object_bbox(annotation): + bbox = annotation['bbox'] + x, y, w, h = [int(x) for x in bbox] + + box_pad = max(160, int(max(w, h) * 0.3)) + crop_x1 = max(0, x - box_pad) + crop_y1 = max(0, y - box_pad) + x = x - crop_x1 + y = y - crop_y1 + return x, y, w, h + + +def gaussian_filter_density(gt): + density = np.zeros(gt.shape, dtype=np.float32) + gt_count = np.count_nonzero(gt) + if gt_count == 0: + return density + pts = np.array(list(zip(np.nonzero(gt)[1], np.nonzero(gt)[0]))) # (x,y) + leaf_size = 2048 + # build kd tree + tree = scipy.spatial.KDTree(pts.copy(), leafsize=leaf_size) + # query kd tree + distances, locations = tree.query(pts, k=4) + + for i, pt in enumerate(pts): + pt2d = np.zeros(gt.shape, dtype=np.float32) + pt2d[pt[1], pt[0]] = 1. + if gt_count > 1: + sigma = (distances[i][1] + distances[i][2] + distances[i][3]) * 0.085 + sigma = min(sigma, 999) # avoid inf + else: + raise NotImplementedError('should not be here!!') + density += scipy.ndimage.filters.gaussian_filter(pt2d, sigma, mode='constant') + return density + + +def synthesize(strategics, save_json_file='', output_dir='', save_mask=False): + with open('ratio_annotations.json') as fid: + ratio_annotations = json.load(fid) + + with open('instances_train2019.json') as fid: + data = json.load(fid) + images = {} + for x in data['images']: + images[x['id']] = x + + annotations = {} + for x in data['annotations']: + annotations[images[x['image_id']]['file_name']] = x + + object_paths = glob.glob(os.path.join('/data7/lufficc/process_rpc/cropped_train2019/', '*.jpg')) + + object_category_paths = defaultdict(list) + for path in object_paths: + name = os.path.basename(path) + category = annotations[name]['category_id'] + object_category_paths[category].append(path) + object_category_paths = dict(object_category_paths) + + bg_img_cv = cv2.imread('bg.jpg') + bg_height, bg_width = bg_img_cv.shape[:2] + mask_img_cv = np.zeros((bg_height, bg_width), dtype=np.uint8) + + json_ann = [] + for image_id, num_per_category in tqdm(strategics): + bg_img = Image.fromarray(bg_img_cv) + mask_img = Image.fromarray(mask_img_cv) + synthesize_annotations = [] + for category, count in num_per_category.items(): + category = int(category) + for _ in range(count): + paths = object_category_paths[category] + + object_path = sample_select_object_index(category, paths, ratio_annotations, threshold=0.45) + + name = os.path.basename(object_path) + mask_path = os.path.join('/data7/lufficc/rpc/object_masks/', '{}.png'.format(name.split('.')[0])) + + obj = Image.open(object_path) + mask = Image.open(mask_path).convert('L') + + # dense object bbox + # --------------------------- + # Crop 
according to json annotation + # --------------------------- + x, y, w, h = get_object_bbox(annotations[name]) + obj = obj.crop((x, y, x + w, y + h)) + mask = mask.crop((x, y, x + w, y + h)) + + # --------------------------- + # Random scale + # --------------------------- + scale = random.uniform(0.4, 0.7) + w, h = int(w * scale), int(h * scale) + obj = obj.resize((w, h), resample=Image.BILINEAR) + mask = mask.resize((w, h), resample=Image.BILINEAR) + + # --------------------------- + # Random rotate + # --------------------------- + angle = random.random() * 360 + obj = obj.rotate(angle, resample=Image.BILINEAR, expand=1) + mask = mask.rotate(angle, resample=Image.BILINEAR, expand=1) + + # --------------------------- + # Crop according to mask + # --------------------------- + where = np.where(np.array(mask)) + y1, x1 = np.amin(where, axis=1) + y2, x2 = np.amax(where, axis=1) + obj = obj.crop((x1, y1, x2, y2)) + mask = mask.crop((x1, y1, x2, y2)) + w, h = obj.width, obj.height + + pad = 2 + pos_x, pos_y = generated_position(bg_width, bg_height, w, h, pad) + start = time.time() + threshold = 0.5 + while not check_iou(synthesize_annotations, box=(pos_x, pos_y, w, h), threshold=threshold): + if (time.time() - start) > 3: # cannot find a valid position in 3 seconds + start = time.time() + threshold += 0.1 + continue + pos_x, pos_y = generated_position(bg_width, bg_height, w, h, pad) + + bg_img.paste(obj, box=(pos_x, pos_y), mask=mask) + if save_mask: + mask_img.paste(mask, box=(pos_x, pos_y), mask=mask) + + # --------------------------- + # Find center of mass + # --------------------------- + mask_array = np.array(mask) + center_of_mass = ndimage.measurements.center_of_mass(mask_array) # y, x + center_of_mass = [int(round(x)) for x in center_of_mass] + center_of_mass = center_of_mass[1] + pos_x, center_of_mass[0] + pos_y # map to whole image + + synthesize_annotations.append({ + 'bbox': (pos_x, pos_y, w, h), + 'category_id': category, + 'center_of_mass': center_of_mass, + }) + + assert bg_height == 1815 and bg_width == 1815 + scale = 200.0 / 1815 + gt = np.zeros((round(bg_height * scale), round(bg_width * scale))) + for item in synthesize_annotations: + center_of_mass = item['center_of_mass'] + gt[round(center_of_mass[1] * scale), round(center_of_mass[0] * scale)] = 1 + + assert gt.shape[0] == 200 and gt.shape[1] == 200 + + density = gaussian_filter_density(gt) + image_name = '{}.jpg'.format(image_id) + + bg_img.save(os.path.join(output_dir, image_name)) + np.save(os.path.join(output_dir, 'density_maps', image_id), density) + + # plt.subplot(121) + # plt.imshow(density, cmap='gray') + # + # plt.subplot(122) + # plt.imshow(bg_img) + # + # print(len(synthesize_annotations)) + # print(density.sum()) + # plt.show() + # quit() + + if save_mask: + mask_img.save(os.path.join(output_dir, 'masks', image_name)) + json_ann.append({ + 'image_id': image_name, + 'objects': synthesize_annotations + }) + if save_json_file: + with open(save_json_file, 'w') as fid: + json.dump(json_ann, fid) + + +if __name__ == '__main__': + parser = ArgumentParser(description="Synthesize fake images") + parser.add_argument('--count', type=int, default=32) + parser.add_argument('--local_rank', type=int, default=0) + args = parser.parse_args() + + # counter = { + # 'easy_mode': 0, + # 'medium_mode': 0, + # 'hard_mode': 0 + # } + # strategics = [] + # for image_id in tqdm(range(GENERATED_NUM)): + # num_per_category = buy_strategic(counter) + # strategics.append(('synthesized_image_{}'.format(image_id), num_per_category)) + # + 
# if os.path.exists('strategics.json'): + # os.remove('strategics.json') + # with open('strategics.json', 'w') as f: + # json.dump(strategics, f) + # print(counter) # {'easy_mode': 25078, 'medium_mode': 37287, 'hard_mode': 37635} + # quit() + + with open('strategics.json') as f: + strategics = json.load(f) + strategics = sorted(strategics, key=lambda s: s[0]) + version = 'density_map_0_45_threshold' + + output_dir = os.path.join('synthesize_{}'.format(version)) + if not os.path.exists(output_dir): + os.mkdir(output_dir) + + if not os.path.exists(os.path.join(output_dir, 'density_maps')): + os.mkdir(os.path.join(output_dir, 'density_maps')) + + threads = [] + num_threads = args.count + sub_strategics = strategics[args.local_rank::num_threads] + save_file = 'synthesize_{}_{}.json'.format(version, args.local_rank) + synthesize(sub_strategics, save_file, output_dir) diff --git a/tools/cityscapes/convert_cityscapes_to_coco.py b/tools/cityscapes/convert_cityscapes_to_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..26c8b9cea417c9f613a4d85c6cfca2a2d6501165 --- /dev/null +++ b/tools/cityscapes/convert_cityscapes_to_coco.py @@ -0,0 +1,237 @@ +#!/usr/bin/env python + +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## + +# This file is copy from https://github.com/facebookresearch/Detectron/tree/master/tools + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import argparse +import h5py +import json +import os +import scipy.misc +import sys + +import cityscapesscripts.evaluation.instances2dict_with_polygons as cs + + +def parse_args(): + parser = argparse.ArgumentParser(description='Convert dataset') + parser.add_argument( + '--dataset', help="cocostuff, cityscapes", default=None, type=str) + parser.add_argument( + '--outdir', help="output dir for json files", default=None, type=str) + parser.add_argument( + '--datadir', help="data dir for annotations to be converted", + default=None, type=str) + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + return parser.parse_args() + + +def poly_to_box(poly): + """Convert a polygon into a tight bounding box.""" + x0 = min(min(p[::2]) for p in poly) + x1 = max(max(p[::2]) for p in poly) + y0 = min(min(p[1::2]) for p in poly) + y1 = max(max(p[1::2]) for p in poly) + box_from_poly = [x0, y0, x1, y1] + + return box_from_poly + +def xyxy_to_xywh(xyxy_box): + xmin, ymin, xmax, ymax = xyxy_box + TO_REMOVE = 1 + xywh_box = (xmin, ymin, xmax - xmin + TO_REMOVE, ymax - ymin + TO_REMOVE) + return xywh_box + + +def convert_coco_stuff_mat(data_dir, out_dir): + """Convert to png and save json with path. 
This currently only contains + the segmentation labels for objects+stuff in cocostuff - if we need to + combine with other labels from original COCO that will be a TODO.""" + sets = ['train', 'val'] + categories = [] + json_name = 'coco_stuff_%s.json' + ann_dict = {} + for data_set in sets: + file_list = os.path.join(data_dir, '%s.txt') + images = [] + with open(file_list % data_set) as f: + for img_id, img_name in enumerate(f): + img_name = img_name.replace('coco', 'COCO').strip('\n') + image = {} + mat_file = os.path.join( + data_dir, 'annotations/%s.mat' % img_name) + data = h5py.File(mat_file, 'r') + labelMap = data.get('S') + if len(categories) == 0: + labelNames = data.get('names') + for idx, n in enumerate(labelNames): + categories.append( + {"id": idx, "name": ''.join(chr(i) for i in data[ + n[0]])}) + ann_dict['categories'] = categories + scipy.misc.imsave( + os.path.join(data_dir, img_name + '.png'), labelMap) + image['width'] = labelMap.shape[0] + image['height'] = labelMap.shape[1] + image['file_name'] = img_name + image['seg_file_name'] = img_name + image['id'] = img_id + images.append(image) + ann_dict['images'] = images + print("Num images: %s" % len(images)) + with open(os.path.join(out_dir, json_name % data_set), 'wb') as outfile: + outfile.write(json.dumps(ann_dict)) + + +# for Cityscapes +def getLabelID(self, instID): + if (instID < 1000): + return instID + else: + return int(instID / 1000) + + +def convert_cityscapes_instance_only( + data_dir, out_dir): + """Convert from cityscapes format to COCO instance seg format - polygons""" + sets = [ + 'gtFine_val', + 'gtFine_train', + 'gtFine_test', + + # 'gtCoarse_train', + # 'gtCoarse_val', + # 'gtCoarse_train_extra' + ] + ann_dirs = [ + 'gtFine_trainvaltest/gtFine/val', + 'gtFine_trainvaltest/gtFine/train', + 'gtFine_trainvaltest/gtFine/test', + + # 'gtCoarse/train', + # 'gtCoarse/train_extra', + # 'gtCoarse/val' + ] + json_name = 'instancesonly_filtered_%s.json' + ends_in = '%s_polygons.json' + img_id = 0 + ann_id = 0 + cat_id = 1 + category_dict = {} + + category_instancesonly = [ + 'person', + 'rider', + 'car', + 'truck', + 'bus', + 'train', + 'motorcycle', + 'bicycle', + ] + + for data_set, ann_dir in zip(sets, ann_dirs): + print('Starting %s' % data_set) + ann_dict = {} + images = [] + annotations = [] + ann_dir = os.path.join(data_dir, ann_dir) + + for root, _, files in os.walk(ann_dir): + for filename in files: + if filename.endswith(ends_in % data_set.split('_')[0]): + if len(images) % 50 == 0: + print("Processed %s images, %s annotations" % ( + len(images), len(annotations))) + json_ann = json.load(open(os.path.join(root, filename))) + image = {} + image['id'] = img_id + img_id += 1 + + image['width'] = json_ann['imgWidth'] + image['height'] = json_ann['imgHeight'] + image['file_name'] = filename[:-len( + ends_in % data_set.split('_')[0])] + 'leftImg8bit.png' + image['seg_file_name'] = filename[:-len( + ends_in % data_set.split('_')[0])] + \ + '%s_instanceIds.png' % data_set.split('_')[0] + images.append(image) + + fullname = os.path.join(root, image['seg_file_name']) + objects = cs.instances2dict_with_polygons( + [fullname], verbose=False)[fullname] + + for object_cls in objects: + if object_cls not in category_instancesonly: + continue # skip non-instance categories + + for obj in objects[object_cls]: + if obj['contours'] == []: + print('Warning: empty contours.') + continue # skip non-instance categories + + len_p = [len(p) for p in obj['contours']] + if min(len_p) <= 4: + print('Warning: invalid contours.') + 
continue # skip non-instance categories + + ann = {} + ann['id'] = ann_id + ann_id += 1 + ann['image_id'] = image['id'] + ann['segmentation'] = obj['contours'] + + if object_cls not in category_dict: + category_dict[object_cls] = cat_id + cat_id += 1 + ann['category_id'] = category_dict[object_cls] + ann['iscrowd'] = 0 + ann['area'] = obj['pixelCount'] + + xyxy_box = poly_to_box(ann['segmentation']) + xywh_box = xyxy_to_xywh(xyxy_box) + ann['bbox'] = xywh_box + + annotations.append(ann) + + ann_dict['images'] = images + categories = [{"id": category_dict[name], "name": name} for name in + category_dict] + ann_dict['categories'] = categories + ann_dict['annotations'] = annotations + print("Num categories: %s" % len(categories)) + print("Num images: %s" % len(images)) + print("Num annotations: %s" % len(annotations)) + with open(os.path.join(out_dir, json_name % data_set), 'w') as outfile: + outfile.write(json.dumps(ann_dict)) + + +if __name__ == '__main__': + args = parse_args() + if args.dataset == "cityscapes_instance_only": + convert_cityscapes_instance_only(args.datadir, args.outdir) + elif args.dataset == "cocostuff": + convert_coco_stuff_mat(args.datadir, args.outdir) + else: + print("Dataset not supported: %s" % args.dataset) diff --git a/tools/cityscapes/instances2dict_with_polygons.py b/tools/cityscapes/instances2dict_with_polygons.py new file mode 100644 index 0000000000000000000000000000000000000000..0bb5604978b863c816fbee257ce3674d4afc5d55 --- /dev/null +++ b/tools/cityscapes/instances2dict_with_polygons.py @@ -0,0 +1,81 @@ +#!/usr/bin/python +# +# Convert instances from png files to a dictionary +# This files is created according to https://github.com/facebookresearch/Detectron/issues/111 + +from __future__ import print_function, absolute_import, division +import os, sys + +sys.path.append( os.path.normpath( os.path.join( os.path.dirname( __file__ ) , '..' 
, 'helpers' ) ) ) +from csHelpers import * + +# Cityscapes imports +from cityscapesscripts.evaluation.instance import * +from cityscapesscripts.helpers.csHelpers import * +import cv2 +from maskrcnn_benchmark.utils import cv2_util + + +def instances2dict_with_polygons(imageFileList, verbose=False): + imgCount = 0 + instanceDict = {} + + if not isinstance(imageFileList, list): + imageFileList = [imageFileList] + + if verbose: + print("Processing {} images...".format(len(imageFileList))) + + for imageFileName in imageFileList: + # Load image + img = Image.open(imageFileName) + + # Image as numpy array + imgNp = np.array(img) + + # Initialize label categories + instances = {} + for label in labels: + instances[label.name] = [] + + # Loop through all instance ids in instance image + for instanceId in np.unique(imgNp): + if instanceId < 1000: + continue + instanceObj = Instance(imgNp, instanceId) + instanceObj_dict = instanceObj.toDict() + + #instances[id2label[instanceObj.labelID].name].append(instanceObj.toDict()) + if id2label[instanceObj.labelID].hasInstances: + mask = (imgNp == instanceId).astype(np.uint8) + contour, hier = cv2_util.findContours( + mask.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE) + + polygons = [c.reshape(-1).tolist() for c in contour] + instanceObj_dict['contours'] = polygons + + instances[id2label[instanceObj.labelID].name].append(instanceObj_dict) + + imgKey = os.path.abspath(imageFileName) + instanceDict[imgKey] = instances + imgCount += 1 + + if verbose: + print("\rImages Processed: {}".format(imgCount), end=' ') + sys.stdout.flush() + + if verbose: + print("") + + return instanceDict + +def main(argv): + fileList = [] + if (len(argv) > 2): + for arg in argv: + if ("png" in arg): + fileList.append(arg) + instances2dict_with_polygons(fileList, True) + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/tools/parse_correct.py b/tools/parse_correct.py new file mode 100644 index 0000000000000000000000000000000000000000..8e2e70e571176014727ad718268b2f6267403ad1 --- /dev/null +++ b/tools/parse_correct.py @@ -0,0 +1,53 @@ +import argparse +import json +from collections import defaultdict + + +def main(): + parser = argparse.ArgumentParser(description="DPNet Parse Correct") + parser.add_argument( + "--pseudo_label", + required=True, + metavar="FILE", + help="path to pseudo file", + type=str, + ) + parser.add_argument( + "--ann_file", + default='/data7/lufficc/rpc/instances_test2019.json', + metavar="FILE", + help="path to gt annotation file", + type=str, + ) + + args = parser.parse_args() + + with open(args.ann_file) as fid: + gt_annotations = json.load(fid) + + annotations = defaultdict(list) + images = [] + for image in gt_annotations['images']: + images.append(image) + for ann in gt_annotations['annotations']: + bbox = ann['bbox'] + x, y, w, h = bbox[0], bbox[1], bbox[2], bbox[3] + annotations[ann['image_id']].append((ann['category_id'], x, y, w, h)) + del gt_annotations + + with open(args.pseudo_label) as fid: + pseudo_annotation = json.load(fid) + + correct = 0 + for ann in pseudo_annotation: + pseudo_labels = [item[0] for item in ann['bbox']] + + gt_labels = [item[0] for item in annotations[ann['id']]] + + if sorted(pseudo_labels) == sorted(gt_labels): + correct += 1 + print('Ratio: {:.3f} ({}/{})'.format(correct / len(pseudo_annotation), correct, len(pseudo_annotation))) + + +if __name__ == "__main__": + main() diff --git a/tools/test_net.py b/tools/test_net.py new file mode 100644 index 
0000000000000000000000000000000000000000..2bf69c5ac6f9167ab9c24a8c809824e8900441e1 --- /dev/null +++ b/tools/test_net.py @@ -0,0 +1,116 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# Set up custom environment before nearly anything else is imported +# NOTE: this should be the first import (no not reorder) +from maskrcnn_benchmark.modeling.detector.adaption_rcnn import AdaptionRCNN +from maskrcnn_benchmark.utils.env import setup_environment # noqa F401 isort:skip + +import argparse +import os + +import torch +from maskrcnn_benchmark.config import cfg +from maskrcnn_benchmark.data import make_data_loader +from maskrcnn_benchmark.engine.inference import inference +from maskrcnn_benchmark.modeling.detector import build_detection_model +from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer +from maskrcnn_benchmark.utils.collect_env import collect_env_info +from maskrcnn_benchmark.utils.comm import synchronize, get_rank +from maskrcnn_benchmark.utils.logger import setup_logger +from maskrcnn_benchmark.utils.miscellaneous import mkdir + + +def do_test(cfg, model, distributed, **kwargs): + if isinstance(model, torch.nn.parallel.DistributedDataParallel): + model = model.module + iou_types = ("bbox",) + if cfg.MODEL.MASK_ON: + iou_types = iou_types + ("segm",) + if cfg.MODEL.KEYPOINT_ON: + iou_types = iou_types + ("keypoints",) + output_folders = [None] * len(cfg.DATASETS.TEST) + dataset_names = kwargs.pop('dataset_names', cfg.DATASETS.TEST) + if cfg.OUTPUT_DIR: + for idx, dataset_name in enumerate(dataset_names): + output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) + mkdir(output_folder) + output_folders[idx] = output_folder + data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed, datasets=kwargs.pop('datasets', None)) + test_results = [] + generate_pseudo_labels = kwargs.pop('generate_pseudo_labels', cfg.TEST.GENERATE_PSEUDO_LABELS) + if isinstance(model, AdaptionRCNN): + model.generate_pseudo_labels = generate_pseudo_labels + + for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val): + result = inference( + model, + data_loader_val, + dataset_name=dataset_name, + iou_types=iou_types, + box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, + generate_pseudo_labels=generate_pseudo_labels, + device=cfg.MODEL.DEVICE, + expected_results=cfg.TEST.EXPECTED_RESULTS, + expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, + output_folder=output_folder, + **kwargs + ) + test_results.append(result) + synchronize() + return test_results + + +def main(): + parser = argparse.ArgumentParser(description="PyTorch Object Detection Inference") + parser.add_argument( + "--config-file", + default="/private/home/fmassa/github/detectron.pytorch_v2/configs/e2e_faster_rcnn_R_50_C4_1x_caffe2.yaml", + metavar="FILE", + help="path to config file", + ) + parser.add_argument("--local_rank", type=int, default=0) + parser.add_argument("--benchmark", + help='enable `torch.backends.cudnn.benchmark`', + action="store_true") + parser.add_argument( + "opts", + help="Modify config options using the command-line", + default=None, + nargs=argparse.REMAINDER, + ) + + args = parser.parse_args() + + num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 + distributed = num_gpus > 1 + + if distributed: + torch.backends.cudnn.benchmark = args.benchmark + torch.cuda.set_device(args.local_rank) + torch.distributed.init_process_group( + backend="nccl", 
init_method="env://" + ) + synchronize() + + cfg.merge_from_file(args.config_file) + cfg.merge_from_list(args.opts) + cfg.freeze() + + save_dir = "" + logger = setup_logger("maskrcnn_benchmark", save_dir, get_rank()) + logger.info("Using {} GPUs".format(num_gpus)) + logger.info(cfg) + + logger.info("Collecting env info (might take some time)") + logger.info("\n" + collect_env_info()) + + model = build_detection_model(cfg) + model.to(cfg.MODEL.DEVICE) + + output_dir = cfg.OUTPUT_DIR + checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir) + _ = checkpointer.load(cfg.MODEL.WEIGHT) + do_test(cfg, model, distributed) + + +if __name__ == "__main__": + main() diff --git a/tools/train_net.py b/tools/train_net.py new file mode 100644 index 0000000000000000000000000000000000000000..82e30eb0bbff37a73c453ec5115a87bbd0d2ae10 --- /dev/null +++ b/tools/train_net.py @@ -0,0 +1,197 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +r""" +Basic training script for PyTorch +""" + +# Set up custom environment before nearly anything else is imported +# NOTE: this should be the first import (no not reorder) +from maskrcnn_benchmark.utils.env import setup_environment # noqa F401 isort:skip + +import argparse +import logging +import os + +import torch +from maskrcnn_benchmark.config import cfg +from maskrcnn_benchmark.data import make_data_loader +from maskrcnn_benchmark.solver import make_lr_scheduler +from maskrcnn_benchmark.solver import make_optimizer +from maskrcnn_benchmark.engine.inference import inference +from maskrcnn_benchmark.engine.trainer import do_train, cross_do_train +from maskrcnn_benchmark.modeling.detector import build_detection_model +from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer +from maskrcnn_benchmark.utils.collect_env import collect_env_info +from maskrcnn_benchmark.utils.comm import synchronize, get_rank +from maskrcnn_benchmark.utils.imports import import_file +from maskrcnn_benchmark.utils.logger import setup_logger +from maskrcnn_benchmark.utils.miscellaneous import mkdir + + +def train(cfg, local_rank, distributed): + model = build_detection_model(cfg) + device = torch.device(cfg.MODEL.DEVICE) + model.to(device) + + optimizer = make_optimizer(cfg, model) + scheduler = make_lr_scheduler(cfg, optimizer) + + if distributed: + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[local_rank], output_device=local_rank, + # this should be removed if we update BatchNorm stats + broadcast_buffers=False, + ) + + arguments = {} + arguments["iteration"] = 0 + + output_dir = cfg.OUTPUT_DIR + + save_to_disk = get_rank() == 0 + checkpointer = DetectronCheckpointer( + cfg, model, None, None, output_dir, save_to_disk + ) + extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT) + # arguments.update(extra_checkpoint_data) + + checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD + + logger = logging.getLogger("maskrcnn_benchmark.trainer") + if cfg.MODEL.META_ARCHITECTURE == 'AdaptionRCNN': + logger.info('AdaptionRCNN trainer is adapted!') + cross_do_train( + cfg, + model, + optimizer, + scheduler, + checkpointer, + device, + checkpoint_period, + arguments, + distributed, + ) + elif cfg.MODEL.META_ARCHITECTURE == 'GeneralizedRCNN': + logger.info('GeneralizedRCNN trainer is adapted!') + data_loader = make_data_loader( + cfg, + is_train=True, + is_distributed=distributed, + start_iter=arguments["iteration"], + ) + do_train( + cfg, + model, + data_loader, + optimizer, + scheduler, + checkpointer, + device, + 
checkpoint_period, + arguments, + distributed, + ) + + return model + + +def run_test(cfg, model, distributed): + if distributed: + model = model.module + torch.cuda.empty_cache() # TODO check if it helps + iou_types = ("bbox",) + if cfg.MODEL.MASK_ON: + iou_types = iou_types + ("segm",) + if cfg.MODEL.KEYPOINT_ON: + iou_types = iou_types + ("keypoints",) + output_folders = [None] * len(cfg.DATASETS.TEST) + dataset_names = cfg.DATASETS.TEST + if cfg.OUTPUT_DIR: + for idx, dataset_name in enumerate(dataset_names): + output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) + mkdir(output_folder) + output_folders[idx] = output_folder + data_loaders_val = make_data_loader(cfg, is_train=False, is_distributed=distributed) + for output_folder, dataset_name, data_loader_val in zip(output_folders, dataset_names, data_loaders_val): + inference( + model, + data_loader_val, + dataset_name=dataset_name, + iou_types=iou_types, + box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY, + generate_pseudo_labels=cfg.TEST.GENERATE_PSEUDO_LABELS, + device=cfg.MODEL.DEVICE, + expected_results=cfg.TEST.EXPECTED_RESULTS, + expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL, + output_folder=output_folder, + ) + synchronize() + + +def main(): + parser = argparse.ArgumentParser(description="PyTorch Object Detection Training") + parser.add_argument( + "--config-file", + default="", + metavar="FILE", + help="path to config file", + type=str, + ) + parser.add_argument("--local_rank", type=int, default=0) + parser.add_argument( + "--skip-test", + dest="skip_test", + help="Do not test the final model", + action="store_true", + ) + parser.add_argument("--benchmark", + help='enable `torch.backends.cudnn.benchmark`', + action="store_true") + parser.add_argument( + "opts", + help="Modify config options using the command-line", + default=None, + nargs=argparse.REMAINDER, + ) + + args = parser.parse_args() + + num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 + args.distributed = num_gpus > 1 + + if args.distributed: + torch.backends.cudnn.benchmark = args.benchmark + torch.cuda.set_device(args.local_rank) + torch.distributed.init_process_group( + backend="nccl", init_method="env://" + ) + synchronize() + + cfg.merge_from_file(args.config_file) + cfg.merge_from_list(args.opts) + cfg.freeze() + + output_dir = cfg.OUTPUT_DIR + if output_dir: + mkdir(output_dir) + + logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank()) + logger.info("Using {} GPUs".format(num_gpus)) + logger.info(args) + + logger.info("Collecting env info (might take some time)") + logger.info("\n" + collect_env_info()) + + logger.info("Loaded configuration file {}".format(args.config_file)) + with open(args.config_file, "r") as cf: + config_str = "\n" + cf.read() + logger.info(config_str) + logger.info("Running with config:\n{}".format(cfg)) + + model = train(cfg, args.local_rank, args.distributed) + + if not args.skip_test: + run_test(cfg, model, args.distributed) + + +if __name__ == "__main__": + main()
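A note on the synthesis output above: `toolboxes/synthesize_images.py` saves one density map per generated image, and since each pasted object contributes a (roughly unit-mass) Gaussian, the integral of the map should approximately equal the number of objects recorded for that image in the per-rank JSON file. The snippet below is a minimal, hypothetical sanity-check sketch, not part of this patch; it assumes the default `version` string, output directory and `--local_rank 0` JSON naming used in that script, so the paths may need adjusting.

```python
# Hypothetical sanity check (not part of the patch): compare the integral of
# each saved density map with the object count in the rank-0 JSON annotations.
import json
import os

import numpy as np

version = 'density_map_0_45_threshold'          # default version string in synthesize_images.py
output_dir = 'synthesize_{}'.format(version)    # images and density_maps/ are written here

with open('synthesize_{}_0.json'.format(version)) as fid:  # JSON written by rank 0
    annotations = json.load(fid)

for ann in annotations[:10]:
    image_id = os.path.splitext(ann['image_id'])[0]
    density = np.load(os.path.join(output_dir, 'density_maps', image_id + '.npy'))
    # The Gaussian kernels are normalized, so the sum should be close to the
    # object count (slightly lower when kernels are clipped at the map border).
    print(image_id, len(ann['objects']), round(float(density.sum()), 2))
```

If the two numbers diverge badly for many images, the center-of-mass mapping or the 200×200 rescaling inside `synthesize()` is the first place to look.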