This is my report on Assignment 1.
I am going to run 4 different algorithms of volume control and compare their speed.
Let's start with analyzing each file. Note: copyright: Chris Tyler 2017.11.29-2021.11.16 - Licensed under GPLv3. Modified by: Loran
The main value of this report is in the answers to the inline questions. Some of them made me think a lot.
vol.h
#define SAMPLES 400000000 // L: this sample size allows runtime for about 20s
#define VOLUME 50.0 // Percent of original volume
void vol_createsample(int16_t* sample, int32_t sample_count);
vol0.c
naive scaling. Simply multiply by the factor. (Floating point math, slow) Comments also have some of my answers for assignment questions.
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <time.h>
#include "vol.h"
// >>L: main algorithm function
/*
 * Naive scaling: multiply the sample by volume/100 in floating point.
 *
 * sample: signed 16-bit input sample
 * volume: volume as a percentage of the original
 * returns the scaled sample, truncated toward zero when converted back to int16_t
 */
int16_t scale_sample(int16_t sample, int volume) {
    /* volume / 100.0 is evaluated in double, then narrowed to float */
    const float factor = (float) (volume / 100.0);
    const float scaled = factor * (float) sample;

    return (int16_t) scaled;
}
/*
 * vol0 benchmark driver.
 *
 * Fills in[] with dummy samples, scales every sample into out[] with
 * scale_sample(), then prints a checksum of out[].
 *
 * Returns 0 on success, 1 if the sample buffers cannot be allocated.
 */
int main() {
    int x;
    int ttl = 0;

    // ---- Create in[] and out[] arrays
    // Each buffer holds SAMPLES int16_t values, so allocation can realistically
    // fail; check before touching the memory.
    int16_t *in = calloc(SAMPLES, sizeof *in);
    int16_t *out = calloc(SAMPLES, sizeof *out);
    if (in == NULL || out == NULL) {
        fprintf(stderr, "calloc failed!\n");
        free(in);   // free(NULL) is a no-op, so this is safe for either failure
        free(out);
        return 1;
    }

    // ---- Create dummy samples in in[]
    vol_createsample(in, SAMPLES);

    // ---- This is the part we're interested in!
    // ---- Scale the samples from in[], placing results in out[]
    for (x = 0; x < SAMPLES; x++) {
        out[x] = scale_sample(in[x], VOLUME);
    }

    // ---- This part sums the samples. (Why is this needed?)
    // >>>>L: This is basically a checksum. The naive floating-point version is
    // >>>>L: the reference, so we can compare the output of the other algorithms
    // >>>>L: against it. Reading out[] also means the scaling loop has an
    // >>>>L: observable result, so the compiler cannot discard it as dead code.
    for (x = 0; x < SAMPLES; x++) {
        ttl = (ttl + out[x]) % 1000;
    }

    // ---- Print the sum of the samples. (Why is this needed?)
    // >>>>>L: output the checksum for user convenience
    printf("Result: %d\n", ttl);

    free(out);
    free(in);
    return 0;
}
vol1.c Fixed-point volume scaling algorithm
(will be showing only diff)
/*
 * Fixed-point scaling: integer multiply followed by a shift.
 *
 * sample: signed 16-bit input sample
 * volume: volume as a percentage of the original
 * returns the scaled sample (fractional bits are dropped by the shift)
 */
int16_t scale_sample(int16_t sample, int volume) {
    // >>> L: vol0.c does its multiply in floating point; here everything stays
    // in integer arithmetic, which is meant to be cheaper but loses some
    // precision. The volume is turned into a fixed-point factor
    // (32767 * volume / 100, doubled by the << 1), the product is computed in
    // int32_t so it has room for the extra bits, and >> 16 shifts the result
    // back down to a 16-bit sample.
    const int32_t factor = (int32_t) (32767 * volume / 100) << 1;
    const int32_t product = (int32_t) sample * factor;

    return product >> 16;
}
vol2.c - precalculated volume scaling table.
(look-up table) To further improve this code, we could have built the table at compile time.
// Allocate the look-up table: 65536 entries of 2 bytes, one slot for every
// possible 16-bit sample value.
precalc = (int16_t*) calloc(65536,2);
if (precalc == NULL) {
printf("malloc failed!\n");
return 1;
}
// Fill the table once: for each possible sample value x, store x scaled by VOLUME percent.
for (x = -32768; x <= 32767; x++) {
// Q: What is the purpose of the cast to uint16_t in the next line?
precalc[(uint16_t) x] = (int16_t) ((float) x * VOLUME / 100.0);
// >>> L: x is an int, not a uint16_t, because it has to hold the negative
// >>> sample values (the loop starts at -32768). An array index, however,
// >>> must not be negative. Converting x to uint16_t maps -32768..-1 onto
// >>> 32768..65535, so every signed 16-bit value gets its own slot in the
// >>> 65536-entry table. What remained unclear to me is why we want negative
// >>> values at all. My guess: sound is a wave that swings above and below
// >>> zero, so samples can be negative.
}
// Scaling is now a single table look-up per sample; the same uint16_t
// conversion turns the signed sample into the index used when filling the table.
for (x = 0; x < SAMPLES; x++) {
out[x]=precalc[(uint16_t) in[x]];
}
vol3.c Dummy volume scaling function
// Q: What's the point of this dummy program? How does it help // with benchmarking?
// L: this is all of the overhead, we are going to subtract this runtime from other function run time to calculate the difference
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include "vol.h"
/*
 * Dummy scaling function: performs no arithmetic and always returns 100,
 * so the program measures everything except the scaling work.
 *
 * sample: ignored
 * volume: ignored
 */
int16_t scale_sample(int16_t sample, int volume) {
    const int16_t fixed_result = 100;

    (void) sample;
    (void) volume;
    return fixed_result;
}
/*
 * vol3 benchmark driver (overhead baseline).
 *
 * Performs the same allocation, sample generation, per-sample call and
 * checksum as the other programs, but calls the dummy scale_sample().
 *
 * Returns 0 on success, 1 if the sample buffers cannot be allocated.
 */
int main() {
    int x;
    int ttl = 0;

    // Each buffer holds SAMPLES int16_t values, so allocation can realistically
    // fail; check before touching the memory.
    int16_t *in = calloc(SAMPLES, sizeof *in);
    int16_t *out = calloc(SAMPLES, sizeof *out);
    if (in == NULL || out == NULL) {
        fprintf(stderr, "calloc failed!\n");
        free(in);   // free(NULL) is a no-op, so this is safe for either failure
        free(out);
        return 1;
    }

    // Fill in[] with dummy samples, exactly as the real programs do.
    vol_createsample(in, SAMPLES);

    // Same loop shape as the real programs; only the function body differs.
    for (x = 0; x < SAMPLES; x++) {
        out[x] = scale_sample(in[x], VOLUME);
    }

    // Checksum of out[], kept in the range -999..999 by the modulo.
    for (x = 0; x < SAMPLES; x++) {
        ttl = (ttl + out[x]) % 1000;
    }

    printf("Result: %d\n", ttl);

    free(out);
    free(in);
    return 0;
}
vol4.c volume scaling in C using AArch64 SIMD inline assembler
/*
 * vol4 benchmark driver: volume scaling with AArch64 SIMD inline assembler.
 *
 * Scales eight int16_t samples per loop iteration using the sqrdmulh
 * instruction, then prints a checksum of out[].
 *
 * Returns 0 on success, 1 when built for a non-AArch64 target or when the
 * sample buffers cannot be allocated.
 */
int main() {
#ifndef __aarch64__
    printf("Wrong architecture - written for aarch64 only.\n");
    return 1;   // nothing was computed, so do not report success
#else
    // these variables will also be accessed by our assembler code
    int16_t *in_cursor;     // input cursor
    int16_t *out_cursor;    // output cursor
    int16_t vol_int;        // volume as int16_t
    int16_t *limit;         // end of input array

    int x;                  // array iterator
    int ttl = 0;            // array total

    // ---- Create in[] and out[] arrays
    // Each buffer holds SAMPLES int16_t values, so allocation can realistically
    // fail; check before touching the memory.
    int16_t *in = calloc(SAMPLES, sizeof *in);
    int16_t *out = calloc(SAMPLES, sizeof *out);
    if (in == NULL || out == NULL) {
        fprintf(stderr, "calloc failed!\n");
        free(in);   // free(NULL) is a no-op, so this is safe for either failure
        free(out);
        return 1;
    }

    // ---- Create dummy samples in in[]
    vol_createsample(in, SAMPLES);

    // ---- This is the part we're interested in!
    // ---- Scale the samples from in[], placing results in out[]

    // set vol_int to fixed-point representation of the volume factor
    // Q: should we use 32767 or 32768 in next line? why?
    // >>>L: 32767, as it is the biggest positive int16_t value, aka 0x7FFF
    vol_int = (int16_t) (VOLUME / 100.0 * 32767.0);

    // Q: what is the purpose of these next two lines?
    // >>>L: the name of the array is a pointer to its first element.
    // >>>L: The cursors are copies of those pointers, so they can be advanced
    // >>>L: through the arrays without changing in and out themselves.
    in_cursor = in;
    out_cursor = out;
    limit = in + SAMPLES;

    // Q: what does it mean to "duplicate" values in the next line?
    // >>>> dup copies the single 16-bit value in %w0 (vol_int) into every one of
    // >>>> the eight 16-bit lanes of v1, so that one instruction can multiply
    // >>>> eight samples by the same factor. [based on qzhang125]
    // v1 is listed as clobbered because the instruction overwrites it.
    __asm__ ("dup v1.8h,%w0" : : "r"(vol_int) : "v1"); // duplicate vol_int into v1.8h

    // NOTE: each iteration consumes 8 samples (16 bytes), so this loop assumes
    // SAMPLES is a multiple of 8; otherwise the last load/store would run past
    // the end of the arrays.
    while (in_cursor < limit) {
        __asm__ (
            "ldr q0, [%[in_cursor]], #16    \n\t"
            // load eight samples into q0 (same as v0.8h)
            // from [in_cursor]
            // post-increment in_cursor by 16 bytes
            // and store back into the pointer register

            "sqrdmulh v0.8h, v0.8h, v1.8h   \n\t"
            // with 32 signed integer output,
            // multiply each lane in v0 * v1 * 2
            // saturate results
            // store upper 16 bits of results into
            // the corresponding lane in v0

            "str q0, [%[out_cursor]],#16    \n\t"
            // store eight samples to [out_cursor]
            // post-increment out_cursor by 16 bytes
            // and store back into the pointer register

            // Q: What do these next three lines do?
            // >>>> They are the operand lists of the asm statement:
            // >>>>   1st line: output operands - in_cursor and out_cursor are
            // >>>>             read-write ("+r") registers, because the
            // >>>>             post-increment changes them;
            // >>>>   2nd line: input operands;
            // >>>>   3rd line: clobbers - v0 is overwritten, and "memory" tells
            // >>>>             the compiler that the asm reads and writes memory.
            : [in_cursor]"+r"(in_cursor), [out_cursor]"+r"(out_cursor)
            : "r"(in_cursor),"r"(out_cursor)
            : "v0", "memory"
        );
    }

    // --------------------------------------------------------------------

    for (x = 0; x < SAMPLES; x++) {
        ttl = (ttl + out[x]) % 1000;
    }

    // Q: are the results usable? are they correct?
    // >>> L: I expected the same checksum as the reference program.
    // >>> UPD: it is slightly off, because the scaling is done in integer
    // >>> (fixed-point) arithmetic rather than floating point.
    printf("Result: %d\n", ttl);

    free(out);
    free(in);
    return 0;
#endif
}
2 answers are from blog qzhang125
vol5.c - volume scaling using compiler intrinsics (arm_neon.h), architecture specific
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#ifdef __aarch64__
#include <arm_neon.h>
#endif
#include "vol.h"
/*
 * vol5 benchmark driver: volume scaling with AArch64 NEON intrinsics.
 *
 * Scales eight int16_t samples per loop iteration using the intrinsics from
 * arm_neon.h, then prints a checksum of out[].
 *
 * Returns 0 on success, 1 when built for a non-AArch64 target or when the
 * sample buffers cannot be allocated.
 */
int main() {
#ifndef __aarch64__
    printf("Wrong architecture - written for aarch64 only.\n");
    return 1;   // nothing was computed, so do not report success
#else
    // Plain locals: the intrinsics take ordinary C values, so there is no need
    // to pin these variables to specific machine registers.
    int16_t *in_cursor;     // input cursor (pointer)
    int16_t *out_cursor;    // output cursor (pointer)
    int16_t vol_int;        // volume as int16_t
    int16_t *limit;         // end of input array

    int x;                  // array iterator
    int ttl = 0;            // array total

    // ---- Create in[] and out[] arrays
    // Each buffer holds SAMPLES int16_t values, so allocation can realistically
    // fail; check before touching the memory.
    int16_t *in = calloc(SAMPLES, sizeof *in);
    int16_t *out = calloc(SAMPLES, sizeof *out);
    if (in == NULL || out == NULL) {
        fprintf(stderr, "calloc failed!\n");
        free(in);   // free(NULL) is a no-op, so this is safe for either failure
        free(out);
        return 1;
    }

    // ---- Create dummy samples in in[]
    vol_createsample(in, SAMPLES);

    // ---- This is the part we're interested in!
    // ---- Scale the samples from in[], placing results in out[]
    vol_int = (int16_t) (VOLUME / 100.0 * 32767.0);

    in_cursor = in;
    out_cursor = out;
    limit = in + SAMPLES;

    // NOTE: each iteration consumes 8 samples, so this loop assumes SAMPLES is
    // a multiple of 8; otherwise the last load/store would run past the end of
    // the arrays.
    while (in_cursor < limit) {
        // What do these intrinsic functions do?
        // (See gcc intrinsics documentation)
        // >>>L: they are C intrinsics from arm_neon.h that do the same work as
        // >>>L: the __asm__ block in the previous example:
        // >>>L:   vld1q_s16    - load 8 int16_t samples from in_cursor
        // >>>L:   vdupq_n_s16  - copy vol_int into all 8 lanes of a vector
        // >>>L:   vqrdmulhq_s16 - saturating rounding doubling multiply,
        // >>>L:                   keeping the high half of each product
        // >>>L:   vst1q_s16    - store the 8 results to out_cursor
        // >>>L: architecture specific vectorization
        vst1q_s16(out_cursor, vqrdmulhq_s16(vld1q_s16(in_cursor), vdupq_n_s16(vol_int)));

        // Q: Why is the increment below 8 instead of 16 or some other value?
        // >>>L: the cursors are int16_t pointers, so the increment is counted
        // >>>L: in samples: 8 samples x 2 bytes = 16 bytes, which is the size
        // >>>L: of the vector register.
        // Q: Why is this line not needed in the inline assembler version
        // of this program?
        // L: there the ldr/str instructions advance the pointers themselves;
        // L: it was also described in the comment: "post-increment in_cursor by 16 bytes"
        in_cursor += 8;
        out_cursor += 8;
    }

    // --------------------------------------------------------------------

    for (x = 0; x < SAMPLES; x++) {
        ttl = (ttl + out[x]) % 1000;
    }

    // Q: Are the results usable? Are they accurate?
    // >>>L: I first answered yes. UPD: corrected to NO after seeing the results.
    printf("Result: %d\n", ttl);

    free(out);
    free(in);
    return 0;
#endif
}
My predictions are: (smallest to biggest run-time:)
- vol4: because of assembly vectorization
- vol5: intrinsics version of the assembly vectorization
- vol2: (lookup is 3rd only because table calculated at run-time, not at compile-time)
- vol1: integer division.
- vol0
Results on Aarch.
If I have enough time, I will update this with x86 results. I will be using the average of 3 runs, possibly discarding the first one, as it would be the one run on a cold cache. I will also be using the user time, as we don't care about time spent in system calls.
In the results you can see that the second program (vol1) had a different checksum, as discussed above. vol3 has 0 as its checksum, as it did not run on any real data. Surprisingly, the checksum for the vectorization methods (vol4 and vol5) was also different from the vol0 baseline. I must have overlooked that they in fact use integers, not floats.
$ time ./vol0; time ./vol0; time ./vol0 ; echo; echo; time ./vol1; time ./vol1; time ./vol1; echo; echo; time ./vol2; time ./vol2; time ./vol2; echo; echo; time ./vol3; time ./vol3; time ./vol3; echo; echo; time ./vol4; time ./vol4; time ./vol4; echo; echo; time ./vol5; time ./vol5; time ./vol5; echo; echo;
v0 Result: 698
user 0m21.167s
user 0m21.508s
user 0m20.767s
Average: 21.147
Adjusted for overhead: 0.509
v1 Result: 990
user 0m20.768s
user 0m20.656s
user 0m20.671s
Average:20.698
Adjusted: 0.06
v2 Result: 698
user 0m22.130s
user 0m22.458s
user 0m22.168s
Avg:22.252
Overhead is too different to adjust for
V3: Result: 0
user 0m20.587s
user 0m20.794s
user 0m20.533s
Avg:20.638
V4: Result: 681
user 0m20.141s
user 0m20.313s
user 0m20.524s
AVG: 20.326
V5: Result: 681
user 0m20.311s
user 0m20.253s
user 0m20.192s
AVG:20.252
Conclusion:
I cannot reject my prediction, because the measuring method is too imprecise. If I were to do this again, I would time each function separately.
Appendix:
Raw data with O3 optimization:
time ./vol0; time ./vol0; time ./vol0 ; echo; echo; time ./vol1; time ./vol1; time ./vol1; echo; echo; time ./vol2; time ./vol2; time ./vol2; echo; echo; time ./vol3; time ./vol3; time ./vol3; echo; echo; time ./vol4; time ./vol4; time ./vol4; echo; echo; time ./vol5; time ./vol5; time ./vol5; time ./vol5; echo; echo;
Result: 698
real 0m22.012s
user 0m20.725s
sys 0m1.268s
Result: 698
real 0m22.012s
user 0m20.936s
sys 0m1.059s
Result: 698
real 0m22.548s
user 0m21.397s
sys 0m1.128s
Result: 990
real 0m21.825s
user 0m20.689s
sys 0m1.118s
Result: 990
real 0m21.845s
user 0m20.759s
sys 0m1.069s
Result: 990
real 0m22.048s
user 0m20.980s
sys 0m1.048s
Result: 698
real 0m26.537s
user 0m25.401s
sys 0m1.108s
Result: 698
real 0m26.523s
user 0m25.265s
sys 0m1.239s
Result: 698
real 0m23.485s
user 0m22.377s
sys 0m1.089s
Result: 0
real 0m23.484s
user 0m22.458s
sys 0m1.009s
Result: 0
real 0m21.547s
user 0m20.412s
sys 0m1.119s
Result: 0
real 0m21.552s
user 0m20.396s
sys 0m1.139s
Result: 681
real 0m21.409s
user 0m20.343s
sys 0m1.048s
Result: 681
real 0m21.413s
user 0m20.327s
sys 0m1.069s
Result: 681
real 0m21.401s
user 0m20.352s
sys 0m1.028s
Result: 681
real 0m21.388s
user 0m20.129s
sys 0m1.238s
Result: 681
real 0m22.542s
user 0m21.515s
sys 0m1.009s
Result: 681
real 0m21.697s
user 0m20.398s
sys 0m1.278s
Result: 681
real 0m21.415s
user 0m20.259s
sys 0m1.139s