Ball Tracking

```python
#!/usr/bin/env python
# encoding: utf-8
# import the necessary packages
from collections import deque
from imutils.video import VideoStream
import numpy as np
import argparse
import cv2
import imutils
import time

# construct the argument parse and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-v", "--video", help="path to the (optional) video file")
ap.add_argument("-b", "--buffer", type=int, default=64, help="max buffer size")
args = vars(ap.parse_args())

# define the lower and upper boundaries of the "green" ball in the
# HSV color space, then initialize the list of tracked points
greenLower = (29, 86, 6)
greenUpper = (64, 255, 255)
pts = deque(maxlen=args["buffer"])

# if a video path was not supplied, grab the reference to the webcam
if not args.get("video", False):
    vs = VideoStream(src=0).start()
# otherwise, grab a reference to the video file
else:
    vs = cv2.VideoCapture(args["video"])

# allow the camera or video file to warm up
time.sleep(2.0)

# keep looping
while True:
    # grab the current frame
    frame = vs.read()
    # handle the frame from VideoCapture or VideoStream
    frame = frame[1] if args.get("video", False) else frame
    # if we are viewing a video and we did not grab a frame,
    # then we have reached the end of the video
    if frame is None:
        break
    # resize the frame, blur it, and convert it to the HSV color space
    frame = imutils.resize(frame, width=600)
    blurred = cv2.GaussianBlur(frame, (11, 11), 0)
    hsv = cv2.cvtColor(blurred, cv2.COLOR_BGR2HSV)
    # construct a mask for the color "green", then perform a series of
    # dilations and erosions to remove any small blobs left in the mask
    mask = cv2.inRange(hsv, greenLower, greenUpper)
    mask = cv2.erode(mask, None, iterations=2)
    mask = cv2.dilate(mask, None, iterations=2)
    # find contours in the mask and initialize the current
    # (x, y) center of the ball
    cnts = cv2.findContours(mask.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = imutils.grab_contours(cnts)
    center = None
    # only proceed if at least one contour was found
    if len(cnts) > 0:
        # find the largest contour in the mask, then use it to compute
        # the minimum enclosing circle and centroid
        c = max(cnts, key=cv2.contourArea)
        ((x, y), radius) = cv2.minEnclosingCircle(c)
        M = cv2.moments(c)
        center = (int(M["m10"] / M["m00"]), int(M["m01"] / M["m00"]))
        # only proceed if the radius meets a minimum size
        if radius > 10:
            # draw the circle and centroid on the frame,
            # then update the list of tracked points
            cv2.circle(frame, (int(x), int(y)), int(radius), (0, 255, 255), 2)
            cv2.circle(frame, center, 5, (0, 0, 255), -1)
    # update the points queue
    pts.appendleft(center)
    # loop over the set of tracked points
    for i in range(1, len(pts)):
        # if either of the tracked points are None, ignore them
        if pts[i - 1] is None or pts[i] is None:
            continue
        # otherwise, compute the thickness of the line and
        # draw the connecting lines
        thickness = int(np.sqrt(args["buffer"] / float(i + 1)) * 2.5)
        cv2.line(frame, pts[i - 1], pts[i], (0, 0, 255), thickness)
    # show the frame to our screen
    cv2.imshow("Frame", frame)
    key = cv2.waitKey(1) & 0xFF
    # if the 'q' key is pressed, stop the loop
    if key == ord("q"):
        break

# if we are not using a video file, stop the camera video stream
if not args.get("video", False):
    vs.stop()
# otherwise, release the camera
else:
    vs.release()
# close all windows
cv2.destroyAllWindows()
```

The Size of Objects

size_of_objects_example_02.gif
The measurement relies on a "pixels per metric" ratio computed from a reference object in the image; a worked example follows the list. The reference object must satisfy two requirements:

  1. The reference object should have known dimensions (such as width or height) in terms of a measurable unit (inches, millimeters, etc.).
  2. The reference object should be easy to find, either in terms of location of the object or in its appearance.
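
As a quick worked example (the numbers are illustrative, not taken from the images above): if the left-most reference object is known to be 0.955 inches wide and its bounding box spans 150 pixels, the ratio and every subsequent measurement follow directly:

```python
# illustrative numbers only: a 0.955in-wide reference object measured at 150px
pixels_per_metric = 150.0 / 0.955                        # ~157 pixels per inch
object_width_px = 300.0                                  # measured width of some other object
object_width_in = object_width_px / pixels_per_metric    # ~1.91 inches
print("{:.2f}in".format(object_width_in))
```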

Problems:

  • Without a perfect 90-degree view (or as close to it as possible), the dimensions of the objects can appear distorted.
  • The camera (here an iPhone) needs to be calibrated using its intrinsic and extrinsic parameters; without determining these parameters, photos are prone to radial and tangential lens distortion (a hedged calibration sketch follows this list).
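
A minimal calibration sketch using OpenCV's chessboard routines is shown below; the 9x6 board size and the file names are assumptions for illustration and are not part of the original notes.

```python
# hedged sketch: estimate the intrinsic matrix and distortion coefficients
# from chessboard photos, then undistort a measurement photo
import cv2
import glob
import numpy as np

board = (9, 6)  # inner corners per chessboard row/column (assumed)
objp = np.zeros((board[0] * board[1], 3), np.float32)
objp[:, :2] = np.mgrid[0:board[0], 0:board[1]].T.reshape(-1, 2)

objpoints, imgpoints = [], []
for fname in glob.glob("calibration_images/*.jpg"):   # placeholder path
    gray = cv2.cvtColor(cv2.imread(fname), cv2.COLOR_BGR2GRAY)
    found, corners = cv2.findChessboardCorners(gray, board, None)
    if found:
        objpoints.append(objp)
        imgpoints.append(corners)

# intrinsic parameters + radial/tangential distortion coefficients
ret, mtx, dist, rvecs, tvecs = cv2.calibrateCamera(objpoints, imgpoints, gray.shape[::-1], None, None)

# remove lens distortion before measuring object sizes
undistorted = cv2.undistort(cv2.imread("measurement_photo.jpg"), mtx, dist)
```

The object-size measurement script itself follows.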
```python
#!/usr/bin/env python
# encoding: utf-8
# import the necessary packages
from scipy.spatial import distance as dist
from imutils import perspective
from imutils import contours
import numpy as np
import argparse
import imutils
import cv2

def midpoint(ptA, ptB):
    return ((ptA[0] + ptB[0]) * 0.5, (ptA[1] + ptB[1]) * 0.5)

# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--image", required=True, help="path to the input image")
ap.add_argument("-w", "--width", type=float, required=True, help="width of the left-most object in the image (in inches)")
args = vars(ap.parse_args())

# load the image, convert it to grayscale, and blur it slightly
image = cv2.imread(args["image"])
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
gray = cv2.GaussianBlur(gray, (7, 7), 0)

# perform edge detection, then perform a dilation + erosion to
# close gaps in between object edges
edged = cv2.Canny(gray, 50, 100)
edged = cv2.dilate(edged, None, iterations=1)
edged = cv2.erode(edged, None, iterations=1)

# find contours in the edge map
cnts = cv2.findContours(edged.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = imutils.grab_contours(cnts)

# sort the contours from left-to-right and initialize the
# 'pixels per metric' calibration variable
(cnts, _) = contours.sort_contours(cnts)
pixelsPerMetric = None

# loop over the contours individually
for c in cnts:
    # if the contour is not sufficiently large, ignore it
    if cv2.contourArea(c) < 100:
        continue
    # compute the rotated bounding box of the contour
    orig = image.copy()
    box = cv2.minAreaRect(c)
    box = cv2.cv.BoxPoints(box) if imutils.is_cv2() else cv2.boxPoints(box)
    box = np.array(box, dtype="int")
    # order the points in the contour such that they appear in
    # top-left, top-right, bottom-right, and bottom-left order,
    # then draw the outline of the rotated bounding box
    box = perspective.order_points(box)
    cv2.drawContours(orig, [box.astype("int")], -1, (0, 255, 0), 2)
    # loop over the original points and draw them
    for (x, y) in box:
        cv2.circle(orig, (int(x), int(y)), 5, (0, 0, 255), -1)
    # unpack the ordered bounding box, then compute the midpoint
    # between the top-left and top-right coordinates, followed by
    # the midpoint between bottom-left and bottom-right coordinates
    (tl, tr, br, bl) = box
    (tltrX, tltrY) = midpoint(tl, tr)
    (blbrX, blbrY) = midpoint(bl, br)
    # compute the midpoint between the top-left and bottom-left points,
    # followed by the midpoint between the top-right and bottom-right
    (tlblX, tlblY) = midpoint(tl, bl)
    (trbrX, trbrY) = midpoint(tr, br)
    # draw the midpoints on the image
    cv2.circle(orig, (int(tltrX), int(tltrY)), 5, (255, 0, 0), -1)
    cv2.circle(orig, (int(blbrX), int(blbrY)), 5, (255, 0, 0), -1)
    cv2.circle(orig, (int(tlblX), int(tlblY)), 5, (255, 0, 0), -1)
    cv2.circle(orig, (int(trbrX), int(trbrY)), 5, (255, 0, 0), -1)
    # draw lines between the midpoints
    cv2.line(orig, (int(tltrX), int(tltrY)), (int(blbrX), int(blbrY)), (255, 0, 255), 2)
    cv2.line(orig, (int(tlblX), int(tlblY)), (int(trbrX), int(trbrY)), (255, 0, 255), 2)
    # compute the Euclidean distance between the midpoints
    dA = dist.euclidean((tltrX, tltrY), (blbrX, blbrY))
    dB = dist.euclidean((tlblX, tlblY), (trbrX, trbrY))
    # if the pixels per metric has not been initialized, then
    # compute it as the ratio of pixels to supplied metric
    # (in this case, inches)
    if pixelsPerMetric is None:
        pixelsPerMetric = dB / args["width"]
    # compute the size of the object
    dimA = dA / pixelsPerMetric
    dimB = dB / pixelsPerMetric
    # draw the object sizes on the image
    cv2.putText(orig, "{:.1f}in".format(dimA), (int(tltrX - 15), int(tltrY - 10)), cv2.FONT_HERSHEY_SIMPLEX, 0.65, (255, 255, 255), 2)
    cv2.putText(orig, "{:.1f}in".format(dimB), (int(trbrX + 10), int(trbrY)), cv2.FONT_HERSHEY_SIMPLEX, 0.65, (255, 255, 255), 2)
    # show the output image
    cv2.imshow("Image", orig)
    cv2.waitKey(0)
```

Facial Landmarks

image.png
The pre-trained facial landmark detector inside the dlib library is used to estimate the location of 68 (x, y)-coordinates that map to facial structures on the face.

The indexes of the 68 coordinates can be visualized on the image below:
image.png
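
imutils also exposes the start/end indexes of each facial region within those 68 points; below is a short hedged sketch of slicing the landmark array into named regions (it assumes `shape` is the (68, 2) NumPy array produced by the script that follows).

```python
# hedged sketch: split the (68, 2) landmark array into named facial regions
# using the index map provided by imutils (the same map the drowsiness
# detector later in these notes uses for the eyes)
from imutils import face_utils

def split_landmarks(shape):
    # shape: NumPy array returned by face_utils.shape_to_np(predictor_output)
    regions = {}
    for (name, (start, end)) in face_utils.FACIAL_LANDMARKS_IDXS.items():
        regions[name] = shape[start:end]
    return regions

# e.g. regions["left_eye"] holds the six (x, y) points of the left eye
```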

```python
#!/usr/bin/env python
# encoding: utf-8
# import the necessary packages
from imutils import face_utils
import numpy as np
import argparse
import imutils
import dlib
import cv2

# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-p", "--shape-predictor", required=True, help="path to facial landmark predictor")
ap.add_argument("-i", "--image", required=True, help="path to input image")
args = vars(ap.parse_args())

# initialize dlib's face detector (HOG-based) and then create
# the facial landmark predictor
detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor(args["shape_predictor"])

# load the input image, resize it, and convert it to grayscale
image = cv2.imread(args["image"])
image = imutils.resize(image, width=500)
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# detect faces in the grayscale image
rects = detector(gray, 1)

# loop over the face detections
for (i, rect) in enumerate(rects):
    # determine the facial landmarks for the face region, then
    # convert the facial landmark (x, y)-coordinates to a NumPy array
    shape = predictor(gray, rect)
    shape = face_utils.shape_to_np(shape)
    # convert dlib's rectangle to an OpenCV-style bounding box
    # [i.e., (x, y, w, h)], then draw the face bounding box
    (x, y, w, h) = face_utils.rect_to_bb(rect)
    cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)
    # show the face number
    cv2.putText(image, "Face #{}".format(i + 1), (x - 10, y - 10),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
    # loop over the (x, y)-coordinates for the facial landmarks
    # and draw them on the image
    for (x, y) in shape:
        cv2.circle(image, (x, y), 1, (0, 0, 255), -1)

# show the output image with the face detections + facial landmarks
cv2.imshow("Output", image)
cv2.waitKey(0)
```

Dlib

ml_guide.jpg

Eye Blink

image.png
Object Tracking - Figure 6

Due to noise in a video stream, subpar facial landmark detections, or fast changes in viewing angle, a simple threshold on the eye aspect ratio could produce a false-positive detection, reporting that a blink had taken place when in reality the person had not blinked.

To make the blink detector more robust to these challenges, the following is recommended (a hedged sketch follows the list):

  • Computing the eye aspect ratio for the N-th frame, along with the eye aspect ratios for the N – 6 and N + 6 frames, then concatenating these eye aspect ratios to form a 13-dimensional feature vector.
  • Training a Support Vector Machine (SVM) on these feature vectors.
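
A hedged sketch of that idea with scikit-learn is shown below; the `ears` sequence, the labels, and the linear kernel are illustrative assumptions, since the recommendation above does not come with code for this step.

```python
# hedged sketch: 13-dimensional EAR feature vectors (frames N-6 .. N+6)
# fed to an SVM that classifies blink vs. no-blink; `ears` and
# `blink_labels` are assumed to come from labeled video frames
import numpy as np
from sklearn.svm import SVC

def ear_feature_vectors(ears, window=6):
    # one (2 * window + 1)-dimensional vector per eligible frame
    feats = []
    for n in range(window, len(ears) - window):
        feats.append(ears[n - window:n + window + 1])
    return np.array(feats)

# X = ear_feature_vectors(ears)              # shape: (num_frames - 12, 13)
# y = blink_labels[6:len(ears) - 6]
# clf = SVC(kernel="linear").fit(X, y)
# blink_predictions = clf.predict(X)
```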

Drowsiness

```python
#!/usr/bin/env python
# encoding: utf-8
# import the necessary packages
from scipy.spatial import distance as dist
from imutils.video import VideoStream
from imutils import face_utils
from threading import Thread
import numpy as np
import playsound
import argparse
import imutils
import time
import dlib
import cv2
import os
from urllib.parse import quote

def sound_alarm(fname):
    # play an alarm sound
    cwd = os.getcwd()
    path = os.path.join(cwd, fname)
    playsound.playsound(quote(path))

def eye_aspect_ratio(eye):
    # compute the euclidean distances between the two sets of
    # vertical eye landmark (x, y)-coordinates
    A = dist.euclidean(eye[1], eye[5])
    B = dist.euclidean(eye[2], eye[4])
    # compute the euclidean distance between the horizontal
    # eye landmark (x, y)-coordinates
    C = dist.euclidean(eye[0], eye[3])
    # compute the eye aspect ratio
    ear = (A + B) / (2.0 * C)
    # return the eye aspect ratio
    return ear

# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-p", "--shape-predictor", default='../facial_landmarks/shape_predictor_68_face_landmarks.dat', required=False, help="path to facial landmark predictor")
ap.add_argument("-a", "--alarm", type=str, default="alarm.mp3", help="path to alarm sound file")
ap.add_argument("-w", "--webcam", type=int, default=0, help="index of webcam on system")
args = vars(ap.parse_args())

# define two constants, one for the eye aspect ratio to indicate a blink
# and a second for the number of consecutive frames the eye must be
# below the threshold to set off the alarm
EYE_AR_THRESH = 0.18
EYE_AR_CONSEC_FRAMES = 48

# initialize the frame counter as well as a boolean used to
# indicate if the alarm is going off
COUNTER = 0
ALARM_ON = False

# initialize dlib's face detector (HOG-based) and then create
# the facial landmark predictor
print("[INFO] loading facial landmark predictor...")
detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor(args["shape_predictor"])

# grab the indexes of the facial landmarks for the left
# and right eye, respectively
(lStart, lEnd) = face_utils.FACIAL_LANDMARKS_IDXS["left_eye"]
(rStart, rEnd) = face_utils.FACIAL_LANDMARKS_IDXS["right_eye"]

# start the video stream thread
print("[INFO] starting video stream thread...")
vs = VideoStream(src=args["webcam"]).start()
time.sleep(1.0)

# loop over frames from the video stream
while True:
    # grab the frame from the threaded video stream, resize it,
    # and convert it to grayscale
    frame = vs.read()
    frame = imutils.resize(frame, width=450)
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    # detect faces in the grayscale frame
    rects = detector(gray, 0)
    # loop over the face detections
    for rect in rects:
        # determine the facial landmarks for the face region, then
        # convert the facial landmark (x, y)-coordinates to a NumPy array
        shape = predictor(gray, rect)
        shape = face_utils.shape_to_np(shape)
        # extract the left and right eye coordinates, then use the
        # coordinates to compute the eye aspect ratio for both eyes
        leftEye = shape[lStart:lEnd]
        rightEye = shape[rStart:rEnd]
        leftEAR = eye_aspect_ratio(leftEye)
        rightEAR = eye_aspect_ratio(rightEye)
        # average the eye aspect ratio together for both eyes
        ear = (leftEAR + rightEAR) / 2.0
        # compute the convex hull for the left and right eye,
        # then visualize each of the eyes
        leftEyeHull = cv2.convexHull(leftEye)
        rightEyeHull = cv2.convexHull(rightEye)
        cv2.drawContours(frame, [leftEyeHull], -1, (0, 255, 0), 1)
        cv2.drawContours(frame, [rightEyeHull], -1, (0, 255, 0), 1)
        # check to see if the eye aspect ratio is below the blink
        # threshold, and if so, increment the blink frame counter
        if ear < EYE_AR_THRESH:
            COUNTER += 1
            # if the eyes were closed for a sufficient number of frames,
            # then sound the alarm
            if COUNTER >= EYE_AR_CONSEC_FRAMES:
                # if the alarm is not on, turn it on
                if not ALARM_ON:
                    ALARM_ON = True
                    # check to see if an alarm file was supplied,
                    # and if so, start a thread to have the alarm
                    # sound played in the background
                    if args["alarm"] != "":
                        t = Thread(target=sound_alarm, args=(args["alarm"],))
                        t.daemon = True
                        t.start()
                # draw an alarm on the frame
                cv2.putText(frame, "DROWSINESS ALERT!", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)
        # otherwise, the eye aspect ratio is not below the blink
        # threshold, so reset the counter and alarm
        else:
            COUNTER = 0
            ALARM_ON = False
        # draw the computed eye aspect ratio on the frame to help
        # with debugging and setting the correct eye aspect ratio
        # thresholds and frame counters
        cv2.putText(frame, "EAR: {:.2f}".format(ear), (300, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)
    # show the frame
    cv2.imshow("Frame", frame)
    key = cv2.waitKey(1) & 0xFF
    # if the `q` key was pressed, break from the loop
    if key == ord("q"):
        break

# do a bit of cleanup
cv2.destroyAllWindows()
vs.stop()
```

Neural Network

The feedforward network:
image.png
image.png
image.png
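
Since the original figures are not reproduced here, a minimal NumPy sketch of the feedforward pass may help: each layer applies an activation to a weighted sum of the previous layer's output. The 3072-768-384-2 sizes mirror the Keras script below; the random weights are illustrative only.

```python
# minimal feedforward pass: out = f(W.x + b), layer by layer
import numpy as np

def relu(z):
    return np.maximum(0, z)

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

rng = np.random.default_rng(0)
x = rng.random(3072)                                    # flattened 32x32x3 image
W1, b1 = 0.01 * rng.standard_normal((768, 3072)), np.zeros(768)
W2, b2 = 0.01 * rng.standard_normal((384, 768)), np.zeros(384)
W3, b3 = 0.01 * rng.standard_normal((2, 384)), np.zeros(2)

h1 = relu(W1 @ x + b1)
h2 = relu(W2 @ h1 + b2)
probs = softmax(W3 @ h2 + b3)                           # e.g. cat vs. dog probabilities
print(probs)
```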

```python
#!/usr/bin/env python
# encoding: utf-8
# import the necessary packages
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Activation
from keras.optimizers import SGD
from keras.layers import Dense
from keras.utils import np_utils
from imutils import paths
import numpy as np
import argparse
import cv2
import os

def image_to_feature_vector(image, size=(32, 32)):
    # resize the image to a fixed size, then flatten the image into
    # a list of raw pixel intensities
    return cv2.resize(image, size).flatten()

# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-d", "--dataset", default='/Users/jfdi/Downloads/dogs-vs-cats/train', help="path to input dataset")
ap.add_argument("-m", "--model", default='neural_network.hdf5', help="path to output model file")
args = vars(ap.parse_args())

# grab the list of images that we'll be describing
print("[INFO] describing images...")
imagePaths = list(paths.list_images(args["dataset"]))

# initialize the data matrix and labels list
data = []
labels = []

# loop over the input images
for (i, imagePath) in enumerate(imagePaths, 1):
    # load the image and extract the class label (assuming that our
    # path has the format: /path/to/dataset/{class}.{image_num}.jpg)
    image = cv2.imread(imagePath)
    label = imagePath.split(os.path.sep)[-1].split(".")[0]
    # construct a feature vector of raw pixel intensities, then update
    # the data matrix and labels list
    features = image_to_feature_vector(image)
    data.append(features)
    labels.append(label)
    # show an update every 1,000 images
    if i > 0 and i % 1000 == 0:
        print("[INFO] processed {}/{}".format(i, len(imagePaths)))

# encode the labels, converting them from strings to integers
le = LabelEncoder()
labels = le.fit_transform(labels)

# scale the input image pixels to the range [0, 1], then transform
# the labels into one-hot vectors -- this generates a vector for each
# label where the index of the label is set to `1` and all other
# entries are set to `0`
data = np.array(data) / 255.0
labels = np_utils.to_categorical(labels, 2)

# partition the data into training and testing splits, using 75%
# of the data for training and the remaining 25% for testing
print("[INFO] constructing training/testing split...")
(trainData, testData, trainLabels, testLabels) = train_test_split(data, labels, test_size=0.25, random_state=42)

# define the architecture of the network
model = Sequential()
model.add(Dense(768, input_dim=3072, kernel_initializer="uniform", activation="relu"))
model.add(Dense(384, activation="relu", kernel_initializer="uniform"))
model.add(Dense(2))
model.add(Activation("softmax"))

# train the model using SGD
print("[INFO] compiling model...")
sgd = SGD(lr=0.01)
model.compile(loss="binary_crossentropy", optimizer=sgd, metrics=["accuracy"])
model.fit(trainData, trainLabels, epochs=50, batch_size=128, verbose=1)

# show the accuracy on the testing set
print("[INFO] evaluating on testing set...")
(loss, accuracy) = model.evaluate(testData, testLabels, batch_size=128, verbose=1)
print("[INFO] loss={:.4f}, accuracy: {:.4f}%".format(loss, accuracy * 100))

# dump the network architecture and weights to file
print("[INFO] dumping architecture and weights to file...")
model.save(args["model"])
```

Deep Learning

  1. Load a model from disk.
  2. Pre-process an input image.
  3. Pass the image through the network and obtain the output classifications.

The cv2.dnn module provides helpers to:

  • Load images: cv2.dnn.blobFromImage, cv2.dnn.blobFromImages
  • Import models from various frameworks: cv2.dnn.createCaffeImporter, cv2.dnn.createTensorFlowImporter, cv2.dnn.createTorchImporter
  • Load a serialized model from disk directly: cv2.dnn.readNetFromCaffe, cv2.dnn.readNetFromTensorFlow, cv2.dnn.readNetFromTorch, cv2.dnn.readTorchBlob

![image.png](https://cdn.nlark.com/yuque/0/2020/png/268154/1608705973810-ba733f79-3d43-43e9-8127-b140456f9518.png#align=left&display=inline&height=662&margin=%5Bobject%20Object%5D&name=image.png&originHeight=1324&originWidth=960&size=1579373&status=done&style=none&width=480)
```python
#!/usr/bin/env python
# encoding: utf-8
# import the necessary packages
import numpy as np
import argparse
import time
import cv2

def cv2waitKeyQ():
    # wait until the 'q' key is pressed, then close all windows
    while True:
        if (cv2.waitKey(0) & 0xFF) == ord('q'):
            break
    cv2.destroyAllWindows()

# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--image", default='test.png', help="path to input image")
ap.add_argument("-p", "--prototxt", default='bvlc_googlenet.prototxt', help="path to Caffe 'deploy' prototxt file")
ap.add_argument("-m", "--model", default='bvlc_googlenet.caffemodel', help="path to Caffe pre-trained model")
ap.add_argument("-l", "--labels", default='synset_words.txt', help="path to ImageNet labels (i.e., syn-sets)")
args = vars(ap.parse_args())

# load the input image from disk
image = cv2.imread(args["image"])

# load the class labels from disk
rows = open(args["labels"]).read().strip().split("\n")
classes = [r[r.find(" ") + 1:].split(",")[0] for r in rows]

# our CNN requires fixed spatial dimensions for our input image(s),
# so we need to ensure it is resized to 224x224 pixels while
# performing mean subtraction (104, 117, 123) to normalize the input;
# after executing this command our "blob" now has the shape:
# (1, 3, 224, 224)
blob = cv2.dnn.blobFromImage(image, 1, (224, 224), (104, 117, 123))

# load our serialized model from disk
print("[INFO] loading model...")
net = cv2.dnn.readNetFromCaffe(args["prototxt"], args["model"])

# set the blob as input to the network and perform a forward pass to
# obtain our output classification
net.setInput(blob)
start = time.time()
preds = net.forward()
end = time.time()
print("[INFO] classification took {:.5} seconds".format(end - start))

# sort the indexes of the probabilities in descending order (higher
# probability first) and grab the top-5 predictions
idxs = np.argsort(preds[0])[::-1][:5]

# loop over the top-5 predictions and display them
for (i, idx) in enumerate(idxs):
    # draw the top prediction on the input image
    if i == 0:
        text = "Label: {}, {:.2f}%".format(classes[idx], preds[0][idx] * 100)
        cv2.putText(image, text, (15, 35), cv2.FONT_HERSHEY_SIMPLEX, 1., (0, 0, 255), 2)
    # display the predicted label + associated probability to the console
    print("[INFO] {}. label: {}, probability: {:.5}".format(i + 1, classes[idx], preds[0][idx]))

# display the output image
cv2.imshow("Image", image)
cv2waitKeyQ()
```

Image Hashing

image.png
image.png

```python
#!/usr/bin/env python
# encoding: utf-8
# import the necessary packages
from imutils import paths
import argparse
import time
import sys
import cv2
import os

def dhash(image, hashSize=8):
    if image is None:
        return None
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # resize the input image, adding a single column (width) so we
    # can compute the horizontal gradient
    resized = cv2.resize(image, (hashSize + 1, hashSize))
    # compute the (relative) horizontal gradient between adjacent column pixels
    diff = resized[:, 1:] > resized[:, :-1]
    print(''.join([str(int(x)) for x in diff.flatten()]))
    # convert the difference image to a hash
    return sum([2 ** i for (i, v) in enumerate(diff.flatten()) if v])

ap = argparse.ArgumentParser()
ap.add_argument("-i", "--image", default='test.png', help="image to do difference hashing")
args = vars(ap.parse_args())

# compute the difference hash of the input image and draw it on the image
image = cv2.imread(args['image'])
imageHash = dhash(image)
print(imageHash)
text = str(imageHash)
cv2.putText(image, text, (15, 35), cv2.FONT_HERSHEY_SIMPLEX, 1., (0, 0, 255), 2)
cv2.imshow('image', image)
cv2.waitKey()
cv2.destroyAllWindows()
```
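
In practice, two images are compared by the Hamming distance between their difference hashes: identical images hash to the same value, and near-duplicates differ in only a few bits. A short hedged sketch reusing the `dhash` function above (the file names are placeholders):

```python
# hedged sketch: compare two difference hashes by Hamming distance
def hamming(hashA, hashB):
    # count the bits that differ between the two integer hashes
    return bin(hashA ^ hashB).count("1")

# hashA = dhash(cv2.imread("image_a.png"))
# hashB = dhash(cv2.imread("image_b.png"))
# print(hamming(hashA, hashB))   # 0 = identical hashes, small values = near-duplicates
```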

CNNs

VGGNet-like architectures are characterized by the following (a minimal sketch appears after this list):

  1. Using only 3×3 convolutional layers stacked on top of each other in increasing depth
  2. Reducing volume size by max pooling
  3. Fully-connected layers at the end of the network prior to a softmax classifier

image.png
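
The training script below imports SmallerVGGNet from smallervggnet.py, which is not reproduced in these notes; a minimal sketch of such an architecture, following the three traits above (the filter counts are illustrative assumptions), could look like this:

```python
# hedged sketch of a SmallerVGGNet-style model: stacked 3x3 convolutions,
# max pooling to shrink the volume, and fully-connected layers before softmax
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

def build_smaller_vggnet(width, height, depth, classes):
    return Sequential([
        Conv2D(32, (3, 3), padding="same", activation="relu", input_shape=(height, width, depth)),
        MaxPooling2D(pool_size=(3, 3)),
        Conv2D(64, (3, 3), padding="same", activation="relu"),
        Conv2D(64, (3, 3), padding="same", activation="relu"),
        MaxPooling2D(pool_size=(2, 2)),
        Conv2D(128, (3, 3), padding="same", activation="relu"),
        Conv2D(128, (3, 3), padding="same", activation="relu"),
        MaxPooling2D(pool_size=(2, 2)),
        Flatten(),
        Dense(1024, activation="relu"),
        Dropout(0.5),
        Dense(classes, activation="softmax"),
    ])
```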

```python
#!/usr/bin/env python
# encoding: utf-8
# set the matplotlib backend so figures can be saved in the background
import matplotlib
matplotlib.use("Agg")
# import the necessary packages
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import img_to_array
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from smallervggnet import SmallerVGGNet
import matplotlib.pyplot as plt
from imutils import paths
import numpy as np
import argparse
import random
import pickle
import cv2
import os
from tensorflow.python.client import device_lib

def device_available_gpus():
    # list the local devices visible to TensorFlow and return the GPUs
    local_device_protos = device_lib.list_local_devices()
    print(local_device_protos)
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

print(device_available_gpus())

# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-d", "--dataset", default='/Users/jfdi/Downloads/dogs-vs-cats/train', help="path to input dataset (i.e., directory of images)")
ap.add_argument("-m", "--model", default='catsdogs.model', help="path to output model")
ap.add_argument("-l", "--labelbin", default='lb.pickle', help="path to output label binarizer")
ap.add_argument("-p", "--plot", type=str, default="plot.png", help="path to output accuracy/loss plot")
args = vars(ap.parse_args())

# initialize the number of epochs to train for, initial learning rate,
# batch size, and image dimensions
EPOCHS = 100
INIT_LR = 1e-3
BS = 32
IMAGE_DIMS = (96, 96, 3)

# initialize the data and labels
data = []
labels = []

# grab the image paths and randomly shuffle them
print("[INFO] loading images...")
imagePaths = sorted(list(paths.list_images(args["dataset"])))
random.seed(42)
random.shuffle(imagePaths)
imagePaths = imagePaths[:1000]

# loop over the input images
for (i, imagePath) in enumerate(imagePaths, 1):
    # load the image, pre-process it, and store it in the data list
    image = cv2.imread(imagePath)
    image = cv2.resize(image, (IMAGE_DIMS[1], IMAGE_DIMS[0]))
    image = img_to_array(image)
    data.append(image)
    # extract the class label from the image path and update the
    # labels list
    # label = imagePath.split(os.path.sep)[-2]
    label = (imagePath.split(os.path.sep)[-1]).split('.')[0]
    # label = label*2 if random.randint(1, 10) > 5 else label
    labels.append(label)
    # show an update every 100 images
    if i > 0 and i % 100 == 0:
        print("[INFO] processed {}/{}".format(i, len(imagePaths)))

# scale the raw pixel intensities to the range [0, 1]
data = np.array(data, dtype="float") / 255.0
labels = np.array(labels)
print("[INFO] data matrix: {:.2f}MB".format(data.nbytes / (1024 * 1000.0)))

# binarize the labels
lb = LabelBinarizer()
labels = lb.fit_transform(labels)
print(lb.classes_)

# partition the data into training and testing splits using 80% of
# the data for training and the remaining 20% for testing
(trainX, testX, trainY, testY) = train_test_split(data, labels, test_size=0.2, random_state=42)

# construct the image generator for data augmentation
aug = ImageDataGenerator(rotation_range=25, width_shift_range=0.1, height_shift_range=0.1, shear_range=0.2, zoom_range=0.2, horizontal_flip=True, fill_mode="nearest")

# initialize the model
print("[INFO] compiling model...")
model = SmallerVGGNet.build(width=IMAGE_DIMS[1], height=IMAGE_DIMS[0], depth=IMAGE_DIMS[2], classes=len(lb.classes_))
opt = Adam(lr=INIT_LR, decay=INIT_LR / EPOCHS)
model.compile(loss="SparseCategoricalCrossentropy", optimizer=opt, metrics=["accuracy"])
# binary_crossentropy, categorical_crossentropy

# train the network
print("[INFO] training network...")
H = model.fit(x=aug.flow(trainX, trainY, batch_size=BS), validation_data=(testX, testY), steps_per_epoch=len(trainX) // BS, epochs=EPOCHS, verbose=1)

# save the model to disk
print("[INFO] serializing network...")
model.save(args["model"], save_format="h5")

# save the label binarizer to disk
print("[INFO] serializing label binarizer...")
f = open(args["labelbin"], "wb")
f.write(pickle.dumps(lb))
f.close()

# plot the training loss and accuracy
plt.style.use("ggplot")
plt.figure()
N = EPOCHS
plt.plot(np.arange(0, N), H.history["loss"], label="train_loss")
plt.plot(np.arange(0, N), H.history["val_loss"], label="val_loss")
plt.plot(np.arange(0, N), H.history["accuracy"], label="train_acc")
plt.plot(np.arange(0, N), H.history["val_accuracy"], label="val_acc")
plt.title("Training Loss and Accuracy")
plt.xlabel("Epoch #")
plt.ylabel("Loss/Accuracy")
plt.legend(loc="upper left")
plt.savefig(args["plot"])
```

Object Detection

image.png

```python
#!/usr/bin/env python
# encoding: utf-8
# import the necessary packages
from imutils.video import VideoStream
from imutils.video import FPS
import numpy as np
import argparse
import imutils
import time
import cv2

# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-p", "--prototxt", default='MobileNetSSD_deploy.prototxt', help="path to Caffe 'deploy' prototxt file")
ap.add_argument("-m", "--model", default='MobileNetSSD_deploy.caffemodel', help="path to Caffe pre-trained model")
ap.add_argument("-c", "--confidence", type=float, default=0.2, help="minimum probability to filter weak detections")
args = vars(ap.parse_args())

# initialize the list of class labels MobileNet SSD was trained to
# detect, then generate a set of bounding box colors for each class
CLASSES = ["background", "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"]
COLORS = np.random.uniform(0, 255, size=(len(CLASSES), 3))

# load our serialized model from disk
print("[INFO] loading model...")
net = cv2.dnn.readNetFromCaffe(args["prototxt"], args["model"])

# initialize the video stream, allow the camera sensor to warm up,
# and initialize the FPS counter
print("[INFO] starting video stream...")
vs = VideoStream(src=0).start()
time.sleep(2.0)
fps = FPS().start()

# loop over the frames from the video stream
while True:
    # grab the frame from the threaded video stream
    # and resize it to have a maximum width of 800 pixels
    frame = vs.read()
    frame = imutils.resize(frame, width=800)
    # grab the frame dimensions and convert it to a blob
    (h, w) = frame.shape[:2]
    blob = cv2.dnn.blobFromImage(cv2.resize(frame, (300, 300)), 0.007843, (300, 300), 127.5)
    # pass the blob through the network and obtain the detections
    # and predictions
    net.setInput(blob)
    detections = net.forward()
    # loop over the detections
    for i in np.arange(0, detections.shape[2]):
        # extract the confidence (i.e., probability) associated with
        # the prediction
        confidence = detections[0, 0, i, 2]
        # filter out weak detections by ensuring the `confidence` is
        # greater than the minimum confidence
        if confidence > args["confidence"]:
            # extract the index of the class label from the
            # `detections`, then compute the (x, y)-coordinates of
            # the bounding box for the object
            idx = int(detections[0, 0, i, 1])
            box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
            (startX, startY, endX, endY) = box.astype("int")
            # draw the prediction on the frame
            label = "{}: {:.2f}%".format(CLASSES[idx], confidence * 100)
            cv2.rectangle(frame, (startX, startY), (endX, endY), COLORS[idx], 2)
            y = startY - 15 if startY - 15 > 15 else startY + 15
            cv2.putText(frame, label, (startX, y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, COLORS[idx], 2)
    # show the output frame
    cv2.imshow("Frame", frame)
    key = cv2.waitKey(1) & 0xFF
    # if the `q` key was pressed, break from the loop
    if key == ord("q"):
        break
    # update the FPS counter
    fps.update()

# stop the timer and display FPS information
fps.stop()
print("[INFO] elapsed time: {:.2f}".format(fps.elapsed()))
print("[INFO] approx. FPS: {:.2f}".format(fps.fps()))

# do a bit of cleanup
cv2.destroyAllWindows()
vs.stop()
```