1) Timing tools

time 命令 测量程序执行时间

image.png

callgrind工具

可以测量程序执行指令的总数量。
image.png
Ir means I refs,表示执行的指令的总数量。
image.png

callgrind_annotate callgrind.out.pid 指令

pid指进程,就是valgrind执行之后最左侧的数字。可以分析程序执行指令的详细信息。

  1. /* File: swap.c
  2. * ------------
  3. * Insertion sort with 3 different possible swap routines, which differ in
  4. * where they store temporary: fixed stack array, var stack array, or alloc/free from heap
  5. */
  6. #include <stdio.h>
  7. #include <stdlib.h>
  8. #include <string.h>
  9. // This version uses a fixed size stack buffer for the temp storage.
  10. void swap_fixedstack(void *a, void *b, size_t sz)
  11. {
  12. char tmp[8];
  13. memcpy(tmp, a, sz);
  14. memcpy(a, b, sz);
  15. memcpy(b, tmp, sz);
  16. }
  17. // This version uses a variable-sized stack buffer for the temp storage.
  18. void swap_varstack(void *a, void *b, size_t sz)
  19. {
  20. char tmp[sz];
  21. memcpy(tmp, a, sz);
  22. memcpy(a, b, sz);
  23. memcpy(b, tmp, sz);
  24. }
  25. // The version uses heap (malloc/free) for the temp storage.
  26. void swap_heap(void *a, void *b, size_t sz)
  27. {
  28. void *tmp = malloc(sz);
  29. memcpy(tmp, a, sz);
  30. memcpy(a, b, sz);
  31. memcpy(b, tmp, sz);
  32. free(tmp);
  33. }
  34. long insertion_sort(long arr[], size_t n, void (*swap)(void *, void *, size_t))
  35. {
  36. long nswaps = 0;
  37. for (long i = 1; i < n; i++)
  38. for (long j = i-1; j >= 0 && arr[j+1] < arr[j]; j--) {
  39. swap(&arr[j+1], &arr[j], sizeof(arr[j]));
  40. nswaps++;
  41. }
  42. return nswaps;
  43. }
  44. void *init_heap(int num)
  45. {
  46. return malloc(num); // to resolve malloc/free, warmup heap
  47. }
  48. static struct {
  49. void (*fn)(void *, void *, size_t);
  50. char *name;
  51. } trials[] =
  52. {{swap_fixedstack, "using fixed-len stack array"},
  53. {swap_varstack, "using variable-len stack array"},
  54. {swap_heap, "in heap"}};
  55. static const int nfns = sizeof(trials)/sizeof(*trials);
  56. int main(int argc, char *argv[])
  57. {
  58. const int nelems = 1415; // ~1 million swaps
  59. long unsorted[nelems], arr[nelems];
  60. free(init_heap(8));
  61. for (int j = 0; j < nelems; j++) unsorted[j] = nelems - j;
  62. printf("This program will insertion sort an array of %d elems.\n", nelems);
  63. printf("When swapping elements, temp could be stored on stack or heap. What are relative costs?\n");
  64. printf("Run under callgrind, then annotate to find out!\n\n");
  65. for(int i = 0; i < nfns; i++) {
  66. memcpy(arr, unsorted, sizeof(unsorted)); // init array in same configuration for each trial
  67. long nswaps = insertion_sort(arr, nelems, trials[i].fn);
  68. printf("Insertion sort (%zu total swaps), swap stores temp %s\n", nswaps, trials[i].name);
  69. }
  70. return 0;
  71. }

image.png
可以看到它测量了每个函数花费的指令数,以及他们所在的文件位置。

fcyc.c 和 fcyc.h 程序时间周期测量程序

资源 页面的 作业和实验 压缩包内。

2) Math: integer, float, and bitwise

image.png
image.png
https://ridiculousfish.com/blog/posts/will-it-optimize.html

3) Copying data

image.png
image.png
image.png
image.png

4) Stack vs heap allocation

image.png
image.png
image.png
image.png
image.png
可以看到在堆上交换数据是最慢的,花费指令最多。