This is my report on the assignment 1.

I am going to run 4 different algorithms of volume control and compare their speed.

Let's start with analyzing each file. Note: copyright: Chris Tyler 2017.11.29-2021.11.16 - Licensed under GPLv3. Modified by: Loran

Main value of this report in in answers to inline questions. Some of them made me think a lot.

vol.h


#define SAMPLES 400000000 // L: this sample size allows runtime for about 20s
#define VOLUME 50.0 // Percent of original volume
void vol_createsample(int16_t* sample, int32_t sample_count);

vol0.c

naive scaling. Simply multiply by the factor. (Floating point math, slow) Comments also have some of my answers for assignment questions.


#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <time.h>
#include "vol.h"

// >>L: main algorithm function
int16_t scale_sample(int16_t sample, int volume) {

        return (int16_t) ((float) (volume/100.0) * (float) sample);
}

int main() {
        int             x;
        int             ttl=0;

// ---- Create in[] and out[] arrays
        int16_t*        in;
        int16_t*        out;
        in=(int16_t*) calloc(SAMPLES, sizeof(int16_t));
        out=(int16_t*) calloc(SAMPLES, sizeof(int16_t));

// ---- Create dummy samples in in[]
        vol_createsample(in, SAMPLES);

// ---- This is the part we're interested in!
// ---- Scale the samples from in[], placing results in out[]

        for (x = 0; x < SAMPLES; x++) {
                out[x]=scale_sample(in[x], VOLUME);
        }

// ---- This part sums the samples. (Why is this needed?)
//>>>>L : This is basically a check-sum, with the naive algorithm we don't loose any precision, so we can compare output of other 
        for (x = 0; x < SAMPLES; x++) {
                ttl=(ttl+out[x])%1000;
        }

// ---- Print the sum of the samples. (Why is this needed?)
// >>>>>L:output the check-sum for user convenience
        printf("Result: %d\n", ttl);

        return 0;
}

vol1.c Fixed-point volume scaling algorithm

(will be showing only diff)

int16_t scale_sample(int16_t sample, int volume) {

        return ((((int32_t) sample) * ((int32_t) (32767 * volume / 100) <<1) ) >> 16);
// >>> L:  we  utilize FPU (Floating Point Unit) vs ALU (Arithmetic Logic Unit) used in vol0.c. ALU is slower, but using FPU we are going to loose some precision. As you can see, here we are using int_32, bigger integer to accommodate for loss of precision, as well to create some space for bit shift operations.
}

vol2.c - precalculated volume scaling table.

(look-up table) To further improve this code, we could have made table at compile time


        precalc = (int16_t*) calloc(65536,2);
        if (precalc == NULL) {
                printf("malloc failed!\n");
                return 1;
        }

        for (x = -32768; x <= 32767; x++) {
// Q: What is the purpose of the cast to unint16_t in the next line?
                precalc[(uint16_t) x] = (int16_t) ((float) x * VOLUME / 100.0);
// >>> L: x is int, not a uint_16t. This carries from previous versions of programs, so probably that's why it is int. Also we need to store negative values in x. (loop starts from -32768. But wen we fill out table, we don't want to use negative indexes (counting from end to start). So we cast to unsigned int. What remains unclear to me, why do we want negative values at all. My guess is sound is sinusoidal wave, so it can have negative values.

        }

        for (x = 0; x < SAMPLES; x++) {
                out[x]=precalc[(uint16_t) in[x]];
        }

vol3.c Dummy volume scaling function

// Q: What's the point of this dummy program? How does it help // with benchmarking?

// L: this is all of the overhead, we are going to subtract this runtime from other function run time to calculate the difference

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include "vol.h"

int16_t scale_sample(int16_t sample, int volume) {

        return (int16_t) 100;
}

int main() {
        int             x;
        int             ttl=0;


        int16_t*        in;
        int16_t*        out;
        in=(int16_t*) calloc(SAMPLES, sizeof(int16_t));
        out=(int16_t*) calloc(SAMPLES, sizeof(int16_t));

        vol_createsample(in, SAMPLES);

        for (x = 0; x < SAMPLES; x++) {
                out[x]=scale_sample(in[x], VOLUME);
        }

        for (x = 0; x < SAMPLES; x++) {
                ttl=(ttl+out[x])%1000;
        }

        printf("Result: %d\n", ttl);
        return 0;
}

vol4.c volume scaling in C using AArch64 SIMD inline assembler

int main() {

#ifndef __aarch64__
        printf("Wrong architecture - written for aarch64 only.\n");
#else


        // these variables will also be accessed by our assembler code
        int16_t*        in_cursor;              // input cursor
        int16_t*        out_cursor;             // output cursor
        int16_t         vol_int;                // volume as int16_t

        int16_t*        limit;                  // end of input array

        int             x;                      // array interator
        int             ttl=0 ;                 // array total

// ---- Create in[] and out[] arrays
        int16_t*        in;
        int16_t*        out;
        in=(int16_t*) calloc(SAMPLES, sizeof(int16_t));
        out=(int16_t*) calloc(SAMPLES, sizeof(int16_t));

// ---- Create dummy samples in in[]
        vol_createsample(in, SAMPLES);

// ---- This is the part we're interested in!
// ---- Scale the samples from in[], placing results in out[]


        // set vol_int to fixed-point representation of the volume factor
        // Q: should we use 32767 or 32768 in next line? why?
        // >>>L: 32767, as it is bigest positive int16_t value, aka 0x7FFF
        vol_int = (int16_t)(VOLUME/100.0 * 32767.0);

// Q: what is the purpose of these next two lines?
// >>>L: name of the array, is a pointer to it's first element. It will be incremented later.
// >>> can use them for pointer arithmetic without changing values of original
        in_cursor = in;
        out_cursor = out;
        limit = in + SAMPLES;

// Q: what does it mean to "duplicate" values in the next line?
//  >>>> A duplicate of the value is stored in a vector which will act as an array of equal size. The value to duplicate is %w0 and the duplicate value will be sent to the dupv1.8h.  [(c)qzhang125]
        __asm__ ("dup v1.8h,%w0"::"r"(vol_int)); // duplicate vol_int into v1.8h

        while ( in_cursor < limit ) {
                __asm__ (
                        "ldr q0, [%[in_cursor]], #16    \n\t"
                        // load eight samples into q0 (same as v0.8h)
                        // from [in_cursor]
                        // post-increment in_cursor by 16 bytes
                        // ans store back into the pointer register

                        "sqrdmulh v0.8h, v0.8h, v1.8h   \n\t"
                        // with 32 signed integer output,
                        // multiply each lane in v0 * v1 * 2
                        // saturate results
                        // store upper 16 bits of results into
                        // the corresponding lane in v0

                        "str q0, [%[out_cursor]],#16            \n\t"
                        // store eight samples to [out_cursor]
                        // post-increment out_cursor by 16 bytes
                        // and store back into the pointer register

// Q: What do these next three lines do?
// >>>> These three lines are used to get the value from the input cursor and output cursor then store them into the system memory. [qzhang125]
                        : [in_cursor]"+r"(in_cursor), [out_cursor]"+r"(out_cursor)
                        : "r"(in_cursor),"r"(out_cursor)
                        : "memory"
                        );
        }

// --------------------------------------------------------------------

        for (x = 0; x < SAMPLES; x++) {
                ttl=(ttl+out[x])%1000;
        }

        // Q: are the results usable? are they correct?
        // >>> E: should be correct checksum result. UPD: Slightly off, as it is still integer devision
        printf("Result: %d\n", ttl);

        return 0;

#endif
}

2 answers are from blog qzhang125

vlol5, internal c++ lib, architecture specific

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#ifdef __aarch64__
#include <arm_neon.h>
#endif
#include "vol.h"

int main() {

#ifndef __aarch64__
        printf("Wrong architecture - written for aarch64 only.\n");
#else

        register int16_t*       in_cursor       asm("r20");     // input cursor (pointer)
        register int16_t*       out_cursor      asm("r21");     // output cursor (pointer)
        register int16_t        vol_int         asm("r22");     // volume as int16_t

        int16_t*                limit;          // end of input array

        int                     x;              // array interator
        int                     ttl=0;          // array total

// ---- Create in[] and out[] arrays
        int16_t*        in;
        int16_t*        out;
        in=(int16_t*) calloc(SAMPLES, sizeof(int16_t));
        out=(int16_t*) calloc(SAMPLES, sizeof(int16_t));

// ---- Create dummy samples in in[]
        vol_createsample(in, SAMPLES);

// ---- This is the part we're interested in!
// ---- Scale the samples from in[], placing results in out[]

        vol_int = (int16_t) (VOLUME/100.0 * 32767.0);

        in_cursor = in;
        out_cursor = out;
        limit = in + SAMPLES ;

        while ( in_cursor < limit ) {
                // What do these intrinsic functions do?
                // (See gcc intrinsics documentation)
                 // >>>L: those  are c++ functions, that do the same as _asl_ in example above.
                 // >>>L: architecture specific vectorization
                vst1q_s16(out_cursor, vqrdmulhq_s16(vld1q_s16(in_cursor), vdupq_n_s16(vol_int)));

                // Q: Why is the increment below 8 instead of 16 or some other value?
                // >>>L: size of the vector register
                // Q: Why is this line not needed in the inline assembler version
                // of this program?
                 // L: _asl_ in example above does it automatically (intrinsically
                 //L: it was also described in comment: "post-increment in_cursor by 16 bytes"
                in_cursor += 8;
                out_cursor += 8;
        }

// --------------------------------------------------------------------

        for (x = 0; x < SAMPLES; x++) {
                ttl=(ttl+out[x])%1000;

        }

        // Q: Are the results usable? Are they accurate?
        //Q:  Yes. UPD: corrected to NO after seeing results
        printf("Result: %d\n", ttl);

        return 0;
#endif
}

My predictions are: (smallest to biggest run-time:)

vol4: because accembly vectorization
vol5: c++ version of accembly vectorization
vol2: (lookup is 3rd only because table calculated at run-time, not at compile-time)
vol1: integer division.
vol0

Results on Aarch.

If will have enough time, will update with x86. I will be using average of 3 runs, and possibly discarding the first one, as it would be one ran on cold cash. I also will be using user data, as we don't care for system calls.

In the results you can see, that run 2 had different checksum. As discussed above. Result 3 has 0 as checksum, as it did not run on any real data. Surprisingly Check sum for vectorisation methods was different from benchmark. I must have overlooked that it in fact uses integers, not floats.

$ time ./vol0; time ./vol0; time ./vol0 ; echo; echo; time ./vol1; time ./vol1; time ./vol1; echo; echo; time ./vol2; time ./vol2; time ./vol2; echo; echo; time ./vol3; time ./vol3; time ./vol3; echo; echo; time ./vol4; time ./vol4; time ./vol4; echo; echo; time ./vol5; time ./vol5; time ./vol5;  echo; echo;
v0 Result: 698
user    0m21.167s
user    0m21.508s
user    0m20.767s
Average: 21.147
Adjusted for overhead: 0.509

v1 Result: 990
user    0m20.768s
user    0m20.656s
user    0m20.671s
Average:20.698
Adjusted: 0.06

v2 Result: 698
user    0m22.130s
user    0m22.458s
user    0m22.168s
Avg:22.252
Overhead is too different to adjust for

V3: Result: 0
user    0m20.587s
user    0m20.794s
user    0m20.533s
Avg:20.638

V4: Result: 681
user    0m20.141s
user    0m20.313s
user    0m20.524s
AVG: 20.329

V5: Result: 681
user    0m20.311s
user    0m20.253s
user    0m20.192s
AVG:20.252

Conclusion:

I fail to reject my prediction, due to high imprecision of measuring method. If I were to do this again, I would time each function separately.**

Appendix:

Raw data with O3 optimization:

 time ./vol0; time ./vol0; time ./vol0 ; echo; echo; time ./vol1; time ./vol1; time ./vol1; echo; echo; time ./vol2; time ./vol2; time ./vol2; echo; echo; time ./vol3; time ./vol3; time ./vol3; echo; echo; time ./vol4; time ./vol4; time ./vol4; echo; echo; time ./vol5; time ./vol5; time ./vol5; time ./vol5; echo; echo;
Result: 698

real    0m22.012s
user    0m20.725s
sys     0m1.268s
Result: 698

real    0m22.012s
user    0m20.936s
sys     0m1.059s
Result: 698

real    0m22.548s
user    0m21.397s
sys     0m1.128s


Result: 990

real    0m21.825s
user    0m20.689s
sys     0m1.118s
Result: 990

real    0m21.845s
user    0m20.759s
sys     0m1.069s
Result: 990

real    0m22.048s
user    0m20.980s
sys     0m1.048s


Result: 698

real    0m26.537s
user    0m25.401s
sys     0m1.108s
Result: 698

real    0m26.523s
user    0m25.265s
sys     0m1.239s
Result: 698

real    0m23.485s
user    0m22.377s
sys     0m1.089s


Result: 0

real    0m23.484s
user    0m22.458s
sys     0m1.009s
Result: 0

real    0m21.547s
user    0m20.412s
sys     0m1.119s
Result: 0

real    0m21.552s
user    0m20.396s
sys     0m1.139s


Result: 681

real    0m21.409s
user    0m20.343s
sys     0m1.048s
Result: 681

real    0m21.413s
user    0m20.327s
sys     0m1.069s
Result: 681

real    0m21.401s
user    0m20.352s
sys     0m1.028s


Result: 681

real    0m21.388s
user    0m20.129s
sys     0m1.238s
Result: 681

real    0m22.542s
user    0m21.515s
sys     0m1.009s
Result: 681

real    0m21.697s
user    0m20.398s
sys     0m1.278s
Result: 681

real    0m21.415s
user    0m20.259s
sys     0m1.139s

Assignment 1: Algorithm Selection for Optimization, Discussion

spo600