# Vectorization exercises

These exercises are reused from previous years; kudos (probably) to Filip Štědroňský.

- For each of the following programs, write a vectorized version.
  - A naive solution is provided in `NAME.c` (where `NAME` is the exercise name). Please save your vectorized solution as `NAME_vec.c`.
  - Keep the semantics of the program identical.
  - Use either GCC vector extensions (recommended) or Intel intrinsics.
- Use the Makefile provided below to compile three programs for each exercise:
  - `NAME_novec`: The provided naive source code compiled with autovectorization disabled.
  - `NAME_autovec`: The provided naive source code compiled with autovectorization enabled. This demonstrates the quality of the autovectorization pass in GCC/Clang.
  - `NAME_vec`: Your hand-vectorized version.
- Use `hyperfine` to benchmark the variants.
  - It's fine to benchmark on your laptop as long as you remember the caveats.

## sum.c

Sum numbers in a large array:

```c
#include <stddef.h>
#include <stdio.h>

#define CLOBBER(x) __asm__ volatile ("" : "=rm" (x) :)
#define USE(x) __asm__ volatile ("" : : "rm" (x))

#define N (256 * 1024 * 1024)

__attribute__((aligned(64)))
unsigned arr[N];

void f(void)
{
	CLOBBER(arr);
	unsigned sum = 0;
	for (size_t i = 0; i < N; i++)
		sum += arr[i];
	USE(sum);
}

int main(void)
{
	for (size_t i = 0; i < 100; i++)
		f();
}
```

## count42.c

Count occurrences of 42:

```c
#include <stddef.h>
#include <stdio.h>

#define CLOBBER(x) __asm__ volatile ("" : "=rm" (x) :)
#define USE(x) __asm__ volatile ("" : : "rm" (x))

#define N (256 * 1024 * 1024)

__attribute__((aligned(64)))
unsigned arr[N];

void f(void)
{
	CLOBBER(arr);
	unsigned count = 0;
	for (size_t i = 0; i < N; i++)
		if (arr[i] == 42)
			count++;
	//printf("count = %u\n", count);
	USE(count);
}

int main(void)
{
	arr[2] = arr[24224] = 42;
	for (size_t i = 0; i < 100; i++)
		f();
}
```

## find.c

Like `count42.c`, but find the index of the right-most occurrence of 42.

## max.c

Find the maximum value in a large array:

```c
#include <limits.h>
#include <stddef.h>
#include <stdio.h>

#define CLOBBER(x) __asm__ volatile ("" : "=rm" (x) :)
#define USE(x) __asm__ volatile ("" : : "rm" (x))

#define N (256 * 1024 * 1024)

__attribute__((aligned(64)))
int arr[N];

void f(void)
{
	CLOBBER(arr);
	int max = INT_MIN;
	for (size_t i = 0; i < N; i++)
		if (max < arr[i])
			max = arr[i];
	//printf("%i\n", max);
	USE(max);
}

int main(void)
{
	arr[2] = arr[24224] = 42;
	for (size_t i = 0; i < 100; i++)
		f();
}
```

## maxidx.c

Like `max.c`, but find the maximum _and_ the index of its right-most occurrence.

## incsum.c

(TBD)

## Makefile

It's suggested to use the following Makefile to simplify development and testing:

```make
.PHONY: all clean

CFLAGS = \
	-Wall \
	-Wextra \
	-g \
	-Werror \
	-std=gnu11 \
	-march=native \
	-O3 \

BINS = \
	sum_novec sum_autovec sum_vec \
	[add targets as you go]

all: $(BINS)

%_novec: %.c Makefile
	$(CC) $(CFLAGS) -fno-tree-vectorize -o $@ $<

%_autovec: %.c Makefile
	$(CC) $(CFLAGS) -o $@ $<

%_vec: %_vec.c Makefile
	$(CC) $(CFLAGS) -o $@ $<

clean:
	rm -f -- $(BINS)
```