Virus analysis tools should use functional analysis + sandboxes + artificial CNS (central nervous systems)
This is a “work in progress”. This post allows all uses.
Overhead of full static analysis + sandbox + CNS = 1 second (approx) for analysis of new executables (protects all app launches,) but localPassList (caches) reduce this to less than 1 millisecond (just cost to lookup ResultList::hashes, which is std::unordered_set, a hashmap of hashes).
For the newest sources, use apps such as iSH (for iOS) or Termux (for Android OS) to run this:
git clone https://github.com/SwuduSusuwu/SubStack.git && cd ./SubStack/
less ./cxx/{ClassCns.hxx,ClassResultList.hxx,VirusAnalysis.cxx,ConversationCns.cxx}
cxx/ClassCns.hxx (+ some of cxx/ClassCns.cxx):
/* Tags for the value types which a `Cns` can use as input or as output.
 * Scalars come first, then the `std::vector` form of each scalar (in the same order), then strings. */
typedef enum CnsMode {
	cnsModeBool,
	cnsModeChar,
	cnsModeInt,
	cnsModeUint,
	cnsModeFloat,
	cnsModeDouble,
	cnsModeVectorBool,
	cnsModeVectorChar,
	cnsModeVectorInt,
	cnsModeVectorUint,
	cnsModeVectorFloat,
	cnsModeVectorDouble,
#if defined(CXX_17)
	/* std::string == std::vector<char>, so strings share the tag of `std::vector<char>` */
	cnsModeString = cnsModeVectorChar
#else /* !defined(CXX_17) */
	/* Strings have their own tag; see https://stackoverflow.com/questions/5115166/how-to-construct-a-stdstring-from-a-stdvectorchar */
	cnsModeString
#endif /* defined(CXX_17) */
} CnsMode;
/* Replaces the current process image with `executable`, through POSIX `execve()`.
 * @param executable Path of the program to run; also used as `argv[0]`.
 * @param argsS Passed to the program as one single argument (`argv[1]`); is not split on spaces and is not parsed by a shell. Omitted if empty.
 * @param envVarsS One single `NAME=value` environment entry. Omitted if empty (the program then has an empty environment).
 * @return The result of `execve()`, which returns (with `-1`) only if the program could not be started.
 * NOTE(review): callers which call this more than once in a row appear to expect it to return after the program finishes; `execve()` does not do that (would require `fork()` + `waitpid()`) -- confirm what was intended.
 */
const int posixExec(const std::string executable, const std::string argsS, const std::string envVarsS = "") {
	char *args[] = {
		const_cast<char *>(executable.c_str()),
		NULL, /* slot for `argsS` */
		NULL
	};
	if(!argsS.empty()) { /* an empty string would reach the program as an extra (blank) argument */
		args[1] = const_cast<char *>(argsS.c_str());
	}
	char *envVars[] = {
		NULL, /* slot for `envVarsS` */
		NULL
	};
	if(!envVarsS.empty()) { /* "" is not a valid `NAME=value` entry */
		envVars[0] = const_cast<char *>(envVarsS.c_str());
	}
	return execve(args[0], args, envVars);
}
/* Base class for artificial "central nervous systems".
 * Stores the configuration (modes + sizes) of the network, and declares one `setupSynapses()` overload
 * plus one set of `processTo*()` functions for each supported input type (produced through `templateWorkaround`).
 * The bodies in this class are placeholders: `setupSynapses()` just records the input+output modes, and
 * `processTo*()` asserts that the recorded modes match the call then returns a zero (or empty) value.
 * All functions are virtual, so derived classes can replace them.
 */
typedef class Cns {
public:
	virtual ~Cns() = default; /* has virtual functions, so destruction through `Cns` must reach derived destructors */
	virtual void setInputMode(CnsMode x) {inputMode = x;}
	virtual void setOutputMode(CnsMode x) {outputMode = x;}
	virtual void setInputNeurons(size_t x) {inputNeurons = x;}
	virtual void setOutputNeurons(size_t x) {outputNeurons = x;}
	virtual void setLayersOfNeurons(size_t x) {layersOfNeurons = x;}
	virtual void setNeuronsPerLayer(size_t x) {neuronsPerLayer = x;}
	// template<Intput, Output> virtual void setupSynapses(std::vector<std::tuple<Input, Output>> inputsToOutputs); /* C++ does not support templates of virtual functions ( https://stackoverflow.com/a/78440416/24473928 ) */
	// template<Input, Output> virtual const Output process(Input input);
	/* `templateWorkaround(CNS_MODE, TYPEDEF)` declares, for input type `TYPEDEF` (whose tag is `CNS_MODE`):
	 *  - `setupSynapses()` for each output type; records `inputMode` + `outputMode`.
	 *  - `processTo<Output>()` for each output type; asserts the recorded modes, returns `0` / `{}`.
	 *  - `processToString()`; converts the result of `processToVectorChar()` into `std::string`.
	 * NOTE(review): unless `CXX_17` is set, `cnsModeString != cnsModeVectorChar`, so the assert inside the base `processToVectorChar()` fires if `processToString()` is used with `outputMode == cnsModeString` -- confirm intended.
	 */
#define templateWorkaround(CNS_MODE, TYPEDEF) \
	virtual void setupSynapses(const std::vector<const std::tuple<TYPEDEF, const bool>> &inputsToOutputs) {inputMode = CNS_MODE; outputMode = cnsModeBool;}\
	virtual void setupSynapses(const std::vector<const std::tuple<TYPEDEF, const char>> &inputsToOutputs) {inputMode = CNS_MODE; outputMode = cnsModeChar;}\
	virtual void setupSynapses(const std::vector<const std::tuple<TYPEDEF, const int>> &inputsToOutputs) {inputMode = CNS_MODE; outputMode = cnsModeInt;}\
	virtual void setupSynapses(const std::vector<const std::tuple<TYPEDEF, const unsigned int>> &inputsToOutputs) {inputMode = CNS_MODE; outputMode = cnsModeUint;}\
	virtual void setupSynapses(const std::vector<const std::tuple<TYPEDEF, float>> &inputsToOutputs) {inputMode = CNS_MODE; outputMode = cnsModeFloat;}\
	virtual void setupSynapses(const std::vector<const std::tuple<TYPEDEF, const double>> &inputsToOutputs) {inputMode = CNS_MODE; outputMode = cnsModeDouble;}\
	virtual void setupSynapses(const std::vector<const std::tuple<TYPEDEF, const std::vector<bool>>> &inputsToOutputs) {inputMode = CNS_MODE; outputMode = cnsModeVectorBool;}\
	virtual void setupSynapses(const std::vector<const std::tuple<TYPEDEF, const std::vector<char>>> &inputsToOutputs) {inputMode = CNS_MODE; outputMode = cnsModeVectorChar;}\
	virtual void setupSynapses(const std::vector<const std::tuple<TYPEDEF, const std::vector<int>>> &inputsToOutputs) {inputMode = CNS_MODE; outputMode = cnsModeVectorInt;}\
	virtual void setupSynapses(const std::vector<const std::tuple<TYPEDEF, const std::vector<unsigned int>>> &inputsToOutputs) {inputMode = CNS_MODE; outputMode = cnsModeVectorUint;}\
	virtual void setupSynapses(const std::vector<const std::tuple<TYPEDEF, const std::vector<float>>> &inputsToOutputs) {inputMode = CNS_MODE; outputMode = cnsModeVectorFloat;}\
	virtual void setupSynapses(const std::vector<const std::tuple<TYPEDEF, const std::vector<double>>> &inputsToOutputs) {inputMode = CNS_MODE; outputMode = cnsModeVectorDouble;}\
	virtual void setupSynapses(const std::vector<const std::tuple<TYPEDEF, const std::string>> &inputsToOutputs) {inputMode = CNS_MODE; outputMode = cnsModeString;}\
	virtual const bool processToBool(TYPEDEF input) const {assert(CNS_MODE == inputMode && cnsModeBool == outputMode); return 0;}\
	virtual const char processToChar(TYPEDEF input) const {assert(CNS_MODE == inputMode && cnsModeChar == outputMode); return 0;}\
	virtual const int processToInt(TYPEDEF input) const {assert(CNS_MODE == inputMode && cnsModeInt == outputMode); return 0;}\
	virtual const unsigned int processToUint(TYPEDEF input) const {assert(CNS_MODE == inputMode && cnsModeUint == outputMode); return 0;}\
	virtual const float processToFloat(TYPEDEF input) const {assert(CNS_MODE == inputMode && cnsModeFloat == outputMode); return 0;}\
	virtual const double processToDouble(TYPEDEF input) const {assert(CNS_MODE == inputMode && cnsModeDouble == outputMode); return 0;}\
	virtual const std::vector<bool> processToVectorBool(TYPEDEF input) const {assert(CNS_MODE == inputMode && cnsModeVectorBool == outputMode); return {};}\
	virtual const std::vector<char> processToVectorChar(TYPEDEF input) const {assert(CNS_MODE == inputMode && cnsModeVectorChar == outputMode); return {};}\
	virtual const std::vector<int> processToVectorInt(TYPEDEF input) const {assert(CNS_MODE == inputMode && cnsModeVectorInt == outputMode); return {};}\
	virtual const std::vector<unsigned int> processToVectorUint(TYPEDEF input) const {assert(CNS_MODE == inputMode && cnsModeVectorUint == outputMode); return {};}\
	virtual std::vector<float> processToVectorFloat(TYPEDEF input) const {assert(CNS_MODE == inputMode && cnsModeVectorFloat == outputMode); return {};}\
	virtual const std::vector<double> processToVectorDouble(TYPEDEF input) const {assert(CNS_MODE == inputMode && cnsModeVectorDouble == outputMode); return {};}\
	virtual const std::string processToString(TYPEDEF input) const {const auto val = processToVectorChar(input); return std::string(val.begin(), val.end()); /* iterator pair is valid if `val` is empty (`&val[0]` is not) */}
	templateWorkaround(cnsModeBool, const bool)
	templateWorkaround(cnsModeChar, const char)
	templateWorkaround(cnsModeInt, const int)
	templateWorkaround(cnsModeUint, const unsigned int)
	templateWorkaround(cnsModeFloat, const float)
	templateWorkaround(cnsModeDouble, const double)
	templateWorkaround(cnsModeVectorBool, const std::vector<bool>)
	templateWorkaround(cnsModeVectorChar, const std::vector<char>)
	templateWorkaround(cnsModeVectorInt, const std::vector<int>)
	templateWorkaround(cnsModeVectorUint, const std::vector<unsigned int>)
	templateWorkaround(cnsModeVectorFloat, const std::vector<float>)
	templateWorkaround(cnsModeVectorDouble, const std::vector<double>)
	templateWorkaround(cnsModeString, const std::string)
protected: /* derived classes have to read the configuration to produce their networks */
	/* Initialized, so the asserts in `processTo*()` do not read indeterminate values if used before setup */
	CnsMode inputMode = cnsModeBool, outputMode = cnsModeBool;
	size_t inputNeurons = 0, outputNeurons = 0, layersOfNeurons = 0, neuronsPerLayer = 0;
} Cns;
#ifdef USE_HSOM_CNS
/* Sources: `git clone https://github.com/CarsonScott/HSOM.git`
* Install: `pip install pynum && pip install json && pip install git+https://github.com/CarsonScott/HSOM.git`
* Documentation: `less HSOM/README.md` `less HSOM/Documentation.md` */
/* "If you're using Python >3.5, PyString_FromString() is PyUnicode_FromString()" */
#include <Python.h> /* Sources: `pkg install python` */
/* Sketch of a `Cns` which forwards to the Python "HSOM" package through the embedded CPython interpreter.
 * NOTE(review): this is a draft, and does not compile as written (even with `USE_HSOM_CNS` set):
 *  - the braces do not balance in either branch of `#if USE_PYRUN` (`setupSynapses()` is never closed before `} HsomCns;`),
 *  - `class HsomCns : Cns` inherits with the default access for `class` (private), so it can not be passed as `Cns &`.
 */
typedef class HsomCns : Cns { /* Todo. ( https://stackoverflow.com/questions/3286448/calling-a-python-method-from-c-c-and-extracting-its-return-value ) suggests various syntaxes to use for this, with unanswered comments such as "Does this support classes?" */
//template<Input, Output> void setupSynapses(const std::vector<std::tuple<Input, Output>>) { /* TODO: templates not allowed for virtual functions with C++ ( https://stackoverflow.com/a/78440416/24473928 ), so must produce codes for each combination of inputMode+outputMode */
/* NOTE(review): the parameter has no name, but the `USE_PYRUN` branch refers to `inputsToOutputs`. */
void setupSynapses(const std::vector<std::tuple<float, float>>) {
setenv("PYTHONPATH",".",1);
Py_Initialize();
// PyRun_SimpleString("import sys; sys.path.append('.')"); PyRun_SimpleString("import hsom; from hsom import SelfOrganizingNetwork;");
#if USE_PYRUN /* Was told not to use PyRun because "PyRun requires all results go to stdout" */
PyRun_SimpleString("import sys; sys.path.append('./HSOM/')");
/* Based off of https://github.com/CarsonScott/HSOM/blob/master/examples/self_organizing_network.py
* Not sure if `input_size` is "Inputs from each layer to next layer" and `node_count` is "Inputs to HSOM" (process(input.length())) or vice versa, assumed vice versa */
/* NOTE(review): the literal passed below spans multiple source lines and uses `+` to join literals with `size_t` members; C++ accepts neither (would require a raw string literal, or `std::string` + `std::to_string()`). */
PyRun_SimpleString("import hsom
from hsom import SelfOrganizingNetwork
from random import sample
input_size = " + inputNeurons + "
layer_sizes = []
for x in range(" + layersOfNeurons + "):
layer_sizes.append(" + neuronsPerLayer + ");
layer_sizes.append(" + outputNeurons + ");
input_percents = [0.2, 0.2, 0.2, 0.2, 0.75, 1.0]
learning_rate = 0.05
boost_factor = 1
node_count = 5
winner_count = 1
initial_range = (-0.5, 0.5)
# Create layersOfNeurons+1 hierarchical layers of sizes = neuronsPerLayer, and outputNeurons for last
self_organizing_network = SelfOrganizingNetwork(
input_size=input_size,
layer_sizes=layer_sizes,
input_percents=input_percents,
learning_rates=learning_rate,
boost_factors=boost_factor,
node_counts=node_count,
winner_counts=winner_count,
initial_ranges=initial_range)
# Create a set of sparse samples
samples = []");
/* NOTE(review): `foreach(... as ...)` is not C++ (the range-based `for` is), and `std::tuple` has no `first()`/`last()` (use `std::get<>()`). */
foreach(inputsToOutputs as sample) { /* TODO: templates not allowed for virtual functions with C++ ( https://stackoverflow.com/a/78440416/24473928 ), so must produce codes for each combination of inputMode+outputMode */
PyRun_SimpleString("samples.append(" + sample.first() +" -> " + sample.last() + ")");
}
PyRun_SimpleString("for i in range(200):
self_organizing_network.train(samples)
");
#else /* USE_PYRUN else */
/* NOTE(review): the next statement lacks its `;`. */
PyObject *module = PyImport_ImportModule("hsom")
if(NULL == module) {throw "'hsom' module not found";}
PyObject *selfOrganizingNetwork = PyObject_GetAttrString(module,(char*)"SelfOrganizingNetwork"); /* or "PyObject *pDict = PyModule_GetDict(module); PyObject *selfOrganizingNetwork = PyDict_GetItemString(pDict, (char*)"SelfOrganizingNetwork");" */
if(NULL == selfOrganizingNetwork || !PyCallable_Check(selfOrganizingNetwork)) {throw "'SelfOrganizingNetwork' object not found";}
/* NOTE(review): `PyObject_CallFunction()` returns `PyObject *` (not `double`); the result must be checked for NULL and converted. `selfOrganizingNetwork` is never released. */
double result = PyObject_CallFunction(selfOrganizingNetwork, "d", 2.0); /* or "PyObject *pValue=Py_BuildValue("(z)",(char*)"args"); PyObject *pResult=PyObject_CallObject(selfOrganizingNetwork, pValue); if(NULL == pResult) {throw "PyObject_CallObject failed";} double result = PyInt_AsLong(pResult)); Py_DECREF(pValue);" */
Py_DECREF(module);
/* NOTE(review): this destructor is written inside the (still open) body of `setupSynapses()`, and exists only if `USE_PYRUN` is unset. */
~HsomCns() {
#if PYTHON3
Py_FinalizeEx();
#else
Py_Finalize();
#endif /* PYTHON3 */
}
#endif /* USE_PYRUN else */
} HsomCns;
#endif /* USE_HSOM_CNS */
#ifdef USE_APXR_CNS
/* Sources: `git clone https://github.com/Rober-t/apxr_run.git`
* Howto install apxr_run: `less apxr_run/README.md` or `lynx https://github.com/Rober-t/apxr_run/blob/master/README.md` */
/* Placeholder for a `Cns` which forwards to "apxr_run" (Erlang); has no members so far. */
typedef class ApxrCns : public Cns { /* `public`: `class` defaults to private inheritance, which would not allow use of this as `Cns &` */
/* Todo: https://stackoverflow.com/questions/1811516/integrating-erlang-with-c (first result for "Howto use Erlang functions from C/C++"):
* ""Port drivers: you can link a C code to the Erlang VM, and access it using port_command."" references https://www.erlang.org/doc/tutorial/c_portdriver.html , which appears to just show howto use C/C++ functions from Erlang (not vice versa)
* ""C Nodes: With the ei library you can mimic a VM and talk to your Erlang VMs using the Erlang distribution format."" references https://www.erlang.org/doc/man/ei.html , which shows some promises
* ""The closest thing I know for interfacing Erlang with C++ directly is EPAPI. Of course it relies on the tried and tested C erl_interface that comes standard with the Erlang distribution."" references https://epapi.googlecode.com/ , which returns "404 not found".
*/
} ApxrCns;
#endif /* USE_APXR_CNS */
cxx/ClassResultList.hxx (+ some of cxx/ClassResultList.cxx + cxx/ClassPortableExecutable.hxx):
/* One executable (or library) which is to be analysed: where it is from, plus its contents in 2 encodings. */
typedef class PortableExecutable {
public:
	/* Suchas "C:\Program.exe" or "/usr/bin/library.so" */
	std::string path;

	/* contents; bytecode */
	std::string bytes;

	/* `base16(bytes)`, hexadecimal, for C string functions */
	std::string hex;
} PortableExecutable;
/* Lists of files (or pages).
 * The 3 members are independent containers; none of the code here forces `hashes[x]`, `signatures[x]` and `bytes[x]` to refer to the same file.
 */
typedef struct ResultList {
	std::unordered_set<decltype(Sha2(PortableExecutable::bytes))> hashes; /* Unique checksums of files (or pages), to avoid duplicates, plus to do fast checks for existance */
	/* Element types are `std::string` (not `const std::string`): `std::vector` with the default allocator requires non-const elements. */
	std::vector<std::string> signatures; /* Smallest substrings (or regexes, or Universal Resource Identifiers) unique to this, has uses close to `hashes` but can match if files have small differences */
	std::vector<std::string> bytes; /* Whole files (or pages); uses lots of space, just populate this for signature synthesis (or training CNS). */
	/* Used `std::string` for binaries (versus `vector<char>`) because:
	 * "If you are going to use the data in a string like fashon then you should opt for std::string as using a std::vector may confuse subsequent maintainers. If on the other hand most of the data manipulation looks like plain maths or vector like then a std::vector is more appropriate." -- https://stackoverflow.com/a/1556294/24473928
	 */
} ResultList;
/* Forward declaration (the definition is further down in this file).
 * `true` if `Sha2(bytes)` is in `caches.hashes` or in `list.hashes`; `caches` is not const because hits from `list` are inserted into it. */
const bool resultListHashesHas(const ResultList &list, ResultList &caches, const std::string &bytes);
/* Returns the size of the largest element of `list`, or `0` if `list` has no elements.
 * @param list Container of `std::string`-like values (or, if `PREFERENCE_IS_CSTR`, of NUL-terminated C strings).
 */
template<class List>
const size_t maxOfSizes(const List &list) {
	if(list.begin() == list.end()) {
		return 0; /* `std::max_element()` returns `end()` for empty ranges, which must not be dereferenced */
	}
#if PREFERENCE_IS_CSTR
	const auto it = std::max_element(list.begin(), list.end(), [](const auto &s, const auto &x) { return strlen(s) < strlen(x); });
	return strlen(*it); /* WARNING! `strlen()` just does UTF8-strings/hex-strings; if binary, must use `it->size()` */
#else
	const auto it = std::max_element(list.begin(), list.end(), [](const auto &s, const auto &x) { return s.size() < x.size(); });
	return it->size();
#endif /* if PREFERENCE_IS_CSTR */
}
/* Returns `true` if at least 1 element of `list` compares equal to an element of `list2`.
 * Does not require the lists to be sorted. Cost is up to `list.size() * list2.size()` comparisons.
 */
template<class List>
const bool listsIntersect(const List &list, const List &list2) {
	/* `std::set_intersection()` requires an output iterator and produces the whole intersection; just the first match is required here */
	return list.end() != std::find_first_of(list.begin(), list.end(), list2.begin(), list2.end());
}
/* Returns an iterator to the first element of `list` which is equal to `x`, or `list.end()` if there is none. */
template<class List>
auto listFind(const List &list, const decltype(*list.begin()) &x) -> decltype(list.begin()) { /* `decltype(List::begin())` is ill-formed; `begin()` requires an object */
	return std::find(list.begin(), list.end(), x);
}
/* `true` if some element of `list` compares equal to `x` (linear scan, first to last). */
template<class List>
const bool listHas(const List &list, const decltype(*list.begin()) &x) {
	return std::any_of(list.begin(), list.end(), [&x](const auto &element) { return element == x; });
}
/* Returns `true` if the characters in [`s`, `x`) occur (as a contiguous substring) within at least 1 element of `list`.
 * If `s == x` (empty range), all elements match, so the result is `true` unless `list` is empty.
 */
template<class List>
const bool listHas(const List &list, std::string::const_iterator s, std::string::const_iterator x) {
	for(const auto &chars : list) {
		/* `std::search()` returns `end()` if not found; the iterator is not usable as a boolean */
		if(chars.end() != std::search(chars.begin(), chars.end(), s, x)) {
			return true;
		}
	}
	return false;
}
/* Splits `s` into the pieces which `token` separates.
 * - Pieces between 2 consecutive tokens are empty strings: ("a,,b", ",") gives {"a", "", "b"}.
 * - A token at the end of `s` does not produce a last (empty) piece: ("a,b,", ",") gives {"a", "b"}.
 * - Empty `s` gives an empty list. Empty `token` gives {`s`}.
 */
template<class S>
const std::vector<S> explodeToList(const S &s, const S &token) {
	std::vector<S> list;
	if(token.begin() == token.end()) { /* an empty token matches at all offsets, so would never advance */
		if(s.begin() != s.end()) {
			list.push_back(s);
		}
		return list;
	}
	const auto tokenSize = std::distance(token.begin(), token.end());
	for(auto x = s.begin(); s.end() != x; ) {
		const auto it = std::search(x, s.end(), token.begin(), token.end());
		list.push_back(S(x, it));
		if(s.end() == it) { /* no more tokens; the piece above was the last */
			break;
		}
		x = std::next(it, tokenSize); /* resume past the token (not at it) */
	}
	return list;
}
/* Looks for the smallest substring of `chars` which does not occur within any element of `list`.
 * @return {begin, end} iterators into `chars`. If no proper substring is unique (or `chars` is empty), returns the whole of `chars`;
 *         the whole string is the fallback and is not itself checked against `list`.
 * Relies on: if [s, s + len) occurs in `list`, then all shorter substrings which start at `s` do too,
 * so for each start offset the search can stop at the first length which is found in `list`.
 */
template<class List>
const std::tuple<std::string::const_iterator, std::string::const_iterator> smallestUniqueSubstr(const std::string &chars, const List &list) {
	size_t smallest = chars.size();
	auto retBegin = chars.begin(), retEnd = chars.end();
	for(auto s = chars.begin(); chars.end() != s && 1 < smallest; ++s) { /* `smallest == 1` can not be improved */
		const size_t remaining = static_cast<size_t>(chars.end() - s);
		/* Just lengths below `smallest` are of use; start from the longest such length which fits */
		size_t len = (remaining < smallest) ? remaining : smallest - 1;
		for(; 0 < len; --len) {
			if(listHas(list, s, s + len)) {
				break; /* shorter substrings from `s` are found in `list` too */
			}
			smallest = len;
			retBegin = s, retEnd = s + len;
		}
	} /* Incremental `for()` loops, is a slow method to produce unique substrings; should use binary searches, or quadratic searches, or look for the standard function which optimizes this. */
	return {retBegin, retEnd};
}
/* Returns `true` if `Sha2(chars)` is in `caches.hashes` or in `list.hashes`.
 * @param list The full list (is not altered).
 * @param caches Local caches; hits from `list` are inserted, so the next lookup of the same contents is resolved from `caches`.
 * @param chars The contents (not the hash); this function computes the hash.
 */
const bool resultListHashesHas(const ResultList &list, ResultList &caches, const std::string &chars) {
	const auto charsSha2 = Sha2(chars);
	/* `hashes` is `std::unordered_set`; `count()` uses the hash table, whereas `std::find()` walks all elements */
	if(0 != caches.hashes.count(charsSha2)) {
		return true;
	}
	if(0 != list.hashes.count(charsSha2)) {
		caches.hashes.insert(charsSha2); /* Caches results */
		return true;
	}
	return false;
}
ResultList passList, abortList; /* Stored on disk, all clients use clones of this */
ResultList localPassList; /* Temporary local caches */
const bool virusAnalysisTests(const Cns &cns, const std::string &bytes); /* if errors, `false`. `True` if `abortListSignaturesSynthesis() && `setupAnalysisCns() && setupDisinfectionCns()` */
const bool submitForManualAnalysis(const PortableExecutable &) {return false;} /* Todo: requires compatible server to upload to */
const bool hashAnalysisPass(const PortableExecutable &); /* `false` if hash matches `abortList`, else `true` (passes) */
/* To produce virus signatures:
* use passlists of all files that was reviewed that pass,
* plus abortlists of all files that failed manual review, such lists as Virustotal has.
* `signaturesSynthesis()` is to produce the `abortList.signatures` list, with the smallest substrings unique to infected files;
* is slow, requires huge database of executables, and is not for clients.
*/
void abortListSignaturesSynthesis(ResultList &passList, ResultList &abortList); /* Comodo has a list of virus signatures to check against at https://www.comodo.com/home/internet-security/updates/vdp/database.php */
const bool signatureAnalysisPass(const PortableExecutable &); /* `false` if bytecode has signatures from `abortList`, else `true` (passes) */
const bool hashPlusSignatureAnalysisPass(const PortableExecutable &); /* Optimizes; more fast than `return (hashAnalysisPass(x) && signatureAnalysisPass(x));` */
/* Functional analysis */
const std::vector<std::string> importedFunctionsList(const PortableExecutable &);
const bool functionalAnalysisPass(const PortableExecutable &); /* `false` if uses functions with possible danger, `true` if passes */
/* Analysis sandbox */
const bool sandboxPass(const PortableExecutable &); /* `false` if can not do `chroot()`&&`strace()`. `true` if `straceOutputsPass()` */
const bool straceOutputsPass(const char *straceDumpPath); /* Todo: `strace()` resources have clues how to do this */
/* Analysis CNS */
/* To train (setup synapses) the CNS, is slow plus requires access to huge sample databases,
but the synapses use small resources (allow clients to do fast analysis.) */
void setupAnalysisCns(Cns &cns, const ResultList &pass, const ResultList &abort,
const ResultList &unreviewed = ResultList() /* WARNING! Possible danger to use unreviewed samples */
);
const float cnsAnalysis(const Cns &cns, const std::string &bytes); /* If bytecode resembles abortList, returns `0`. If undecidable, returns `1 / 2`. `1` if resembles passList */
const bool cnsPass(const Cns &cns, const std::string &bytes); /* = `(bool)round(cnsAnalysis(cns, bytes)` */
/* Setup disinfection CNS, uUses more resources than `setupAnalysisCns()` */
/* `abortOrNull` should map to `passOrNull` (`ResultList` is composed of `std::tuple`s, because just `setupDisinfectionCns()` requires this),
* with `abortOrNull->bytes[x] = NULL` (or "\0") for new SW synthesis,
* and `passOrNull->bytes[x] = NULL` (or "\0") if infected and CNS can not cleanse this.
*/
void setupDisinfectionCns(Cns &cns,
const ResultList &passOrNull, /* Expects `resultList->bytes[x] = NULL` if does not pass */
const ResultList &abortOrNull /* Expects `resultList->bytes[x] = NULL` if does pass */
);
/* Uses more resources than `cnsAnalysis()`, can undo infection from bytecodes (restore to fresh SW */
const std::string cnsDisinfection(const Cns &cns, const std::string &bytes);
/* Licenses: allows all uses ("Creative Commons"/"Apache 2") */
/* Exercises the setup functions with tiny samples; returns `true` once all of them have run.
 * The 2 lists are parallel: `abortOrNull.bytes[x]` is the infected form of `passOrNull.bytes[x]` ("" = has no such form).
 */
const bool virusAnalysisTests(Cns &cns) {
	/* Positional aggregate initializers: {hashes, signatures, bytes} */
	ResultList abortOrNull{
		{},
		{},
		{"infection", "infectedSW", ""} /* Use an antivirus vendor's (such as VirusTotal.com's) infected-files databases */
	};
	ResultList passOrNull{
		{},
		{},
		{"", "SW", "newSW"} /* Uses an antivirus vendor's (such as VirusTotal.com's) fresh-files databases */
	};

	abortListSignaturesSynthesis(passList, abortList); /* the global lists (not the samples above) */
	setupAnalysisCns(cns, passOrNull, abortOrNull);
	setupDisinfectionCns(cns, passOrNull, abortOrNull);

	/* Should use OS-specific "hook"/"callback" for `exec()`/app-launches; for now the callback is just produced, not registered (nor called).
	 * Could use `signatureAnalysisPass()`, or `hashPlusSignatureAnalysisPass()` in place of `hashAnalysisPass()`. */
	/* callbackHook(exec, */ [](const PortableExecutable &file) {
		return hashAnalysisPass(file) ? true : false;
	} /* ) */ ;

	return true;
}
/* Hash analysis */
/* Hash analysis.
 * @return `true` if the hash of `file.bytes` is in `passList` (or was cached into `localPassList`),
 *         `false` if it is in `abortList.hashes`;
 *         otherwise the result of `functionalAnalysisPass()` (passes are cached, failures are submitted for manual analysis).
 */
const bool hashAnalysisPass(const PortableExecutable &file) {
	/* `resultListHashesHas()` computes the hash, so pass the contents (a hash of a hash would not match what is stored) */
	if(resultListHashesHas(passList, localPassList, file.bytes)) {
		return true;
	}
	const auto bytesSha2 = Sha2(file.bytes);
	if(0 != abortList.hashes.count(bytesSha2)) { /* `hashes` holds checksums, so look up the checksum (not `file.bytes`) */
		return false;
	}
	if(functionalAnalysisPass(file)) {
		localPassList.hashes.insert(bytesSha2); /* Caches results */
		return true;
	}
	submitForManualAnalysis(file);
	return false;
}
/* Signatures analysis */
/* Signatures analysis.
 * @return `true` if the hash of `file.bytes` is cached in `localPassList`,
 *         `false` if the contents include a signature from `abortList.signatures`;
 *         otherwise the result of `functionalAnalysisPass()` (passes are cached, failures are submitted for manual analysis).
 */
const bool signatureAnalysisPass(const PortableExecutable &file) {
	const auto bytesSha2 = Sha2(file.bytes);
	if(0 != localPassList.hashes.count(bytesSha2)) { /* does not depend on the signature, so is checked once (ahead of the loop) */
		return true;
	}
	for(const auto &sig : abortList.signatures) {
		if(sig.empty()) {
			continue; /* empty signatures are found in all files, so would abort all files */
		}
#if PREFERENCE_IS_CSTR
		if(NULL != strstr(file.hex.c_str(), sig.c_str())) { /* strstr uses text/hex; hex uses more space than binary, so you should use `memmem` or `std::search` with file.bytes */
#else
		if(file.bytes.end() != std::search(file.bytes.begin(), file.bytes.end(), sig.begin(), sig.end())) {
#endif /* PREFERENCE_IS_CSTR */
			return false;
		}
	}
	if(functionalAnalysisPass(file)) {
		localPassList.hashes.insert(bytesSha2); /* Caches results */
		return true;
	}
	submitForManualAnalysis(file);
	return false;
}
/* Fused signature+hash analysis */
/* Fused signature+hash analysis.
 * @return `true` if the hash of `file.bytes` is in `passList` (or cached in `localPassList`),
 *         `false` if the hash is in `abortList.hashes`, or if the contents include a signature from `abortList.signatures`
 *         (in which case the hash is inserted into `abortList.hashes`, so the next check of this file skips the signature scan);
 *         otherwise the result of `functionalAnalysisPass()` (passes are cached, failures are submitted for manual analysis).
 */
const bool signaturePlusHashAnalysisPass(const PortableExecutable &file) {
	/* `resultListHashesHas()` computes the hash, so pass the contents */
	if(resultListHashesHas(passList, localPassList, file.bytes)) {
		return true;
	}
	const auto bytesSha2 = Sha2(file.bytes);
	if(0 != abortList.hashes.count(bytesSha2)) {
		return false;
	}
	for(const auto &sig : abortList.signatures) {
		if(sig.empty()) {
			continue; /* empty signatures are found in all files */
		}
#if PREFERENCE_IS_CSTR
		if(NULL != strstr(file.hex.c_str(), sig.c_str())) { /*`strstr` does text, binaries must use `std::search` or `memmem` */
#else
		if(file.bytes.end() != std::search(file.bytes.begin(), file.bytes.end(), sig.begin(), sig.end())) {
#endif /* PREFERENCE_IS_CSTR */
			abortList.hashes.insert(bytesSha2); /* same key (hash of `bytes`) as the lookup above uses */
			return false;
		}
	}
	if(functionalAnalysisPass(file)) {
		localPassList.hashes.insert(bytesSha2); /* Caches results */
		return true;
	}
	submitForManualAnalysis(file);
	return false;
}
/* The name which the prototype (plus comments) use for the fused analysis; forwards to the function above. */
const bool hashPlusSignatureAnalysisPass(const PortableExecutable &file) {
	return signaturePlusHashAnalysisPass(file);
}
/* To produce virus signatures:
* use passlists of all files that was reviewed that pass,
* plus abortlists of all files that failed manual review, such lists as Virustotal has.
* `signaturesSynthesis()` is to produce the `abortList.signatures` list, with the smallest substrings unique to infected files;
* is slow, requires huge database of executables, and is not for clients.
*/
/* For each non-empty element of `abortList.bytes`, appends to `abortList.signatures` the result of
 * `smallestUniqueSubstr()` against `passList.bytes` (existing signatures are kept).
 */
void abortListSignaturesSynthesis(const ResultList &passList, ResultList &abortList) {
	for(const auto &file : abortList.bytes) {
		if(file.empty()) {
			continue; /* would produce an empty signature, which `std::search()` finds in all files */
		}
		const auto tuple = smallestUniqueSubstr(file, passList.bytes);
		abortList.signatures.push_back(std::string(std::get<0>(tuple), std::get<1>(tuple)));
	} /* The most simple signature is a substring, but some analyses use regexes. */
}
/* Comodo has a list of virus signatures to check against at https://www.comodo.com/home/internet-security/updates/vdp/database.php */
/* Functional analysis */
/* Is to list the functions which `file` imports. Not implemented so far: always returns an empty list. */
const std::vector<std::string> importedFunctionsList(const PortableExecutable &file) {
	(void)file; /* unused until the Todo is done */
	/* Todo; `return functionImportsFromBytecode(file.bytes);` */
	return {}; /* to flow off the end of a non-void function is undefined behavior */
}
/*
* importedFunctionsList resources; “Portable Executable” for Windows ( https://learn.microsoft.com/en-us/windows/win32/debug/pe-format https://wikipedia.org/wiki/Portable_Executable ),
* “Executable and Linkable Format” for most others such as UNIX/Linuxes ( https://wikipedia.org/wiki/Executable_and_Linkable_Format ),
* shows how to analyse lists of libraries(.DLL's/.SO's) the SW uses,
* plus what functions (new syscalls) the SW can goto through `jmp`/`call` instructions.
*
* "x86" instruction list for Intel/AMD ( https://wikipedia.org/wiki/x86 ),
* "aarch64" instruction list for most smartphones/tablets ( https://wikipedia.org/wiki/aarch64 ),
* shows how to analyse what OS functions the SW goes to without libraries (through `int`/`syscall`, old; most new SW uses `jmp`/`call`.)
* Plus, instructions lists show how to analyse what args the apps/SW pass to functions/syscalls (simple for constant args such as "push 0x2; call functions;",
* but if registers/addresses as args such as "push eax; push [address]; call [address2];" must guess what is *"eax"/"[address]"/"[address2]", or use sandboxes.
*
* https://www.codeproject.com/Questions/338807/How-to-get-list-of-all-imported-functions-invoked shows how to analyse dynamic loads of functions (if do this, `syscallsPotentialDanger[]` need not include `GetProcAddress()`.)
*/
const bool functionalAnalysisPass(PortableExecutable &file) {
const auto syscallsUsed = importedFunctionsList(file);
typeof(syscallsUsed) syscallsPotentialDanger = {
"memopen", "fwrite", "socket", "GetProcAddress", "IsVmPresent"
};
std::sort(syscallsPotentialDanger.begin(), syscallsPotentialDanger.end());
std::sort(syscallsUsed.begin(), syscallsUsed.end());
if(listsIntersect(syscallsPotentialDanger, syscallsUsed)) {
return false;
}
return true;
}
/* Analysis sandbox */
/* Analysis sandbox.
 * Steps: backup the sandbox, copy `file` into it, run `file` under `strace` in a `chroot`, move the trace out, restore the sandbox.
 * @return `false` if a step reports failure (nonzero result from `posixExec()`), else the result of `straceOutputsPass()`.
 * NOTE(review): `posixExec()` passes its 2nd parameter to the program as 1 single argument, without a shell,
 * so the quotes/spaces/`>>` below are not interpreted as written here; and if `posixExec()` is `execve()`-based it does not return on success -- confirm, both must be resolved for this to function.
 */
const bool sandboxPass(const PortableExecutable &file) {
	const std::string noEnvVars; /* passed explicitly, so this does not rely on a default argument */
	if(0 != posixExec("/bin/cp", "-r '/usr/home/sandbox/' '/usr/home/sandbox.bak'", noEnvVars)) { /* or produce FS snapshot */
		return false; /* no backup, so must not run (nor "restore", which would remove the sandbox) */
	}
	const bool ran =
		0 == posixExec("/bin/cp", "'" + file.path + "' '/usr/home/sandbox/'", noEnvVars)
		&& 0 == posixExec("/bin/chroot", "'/usr/home/sandbox/' \"strace basename '" + file.path + "'\" >> strace.outputs", noEnvVars)
		&& 0 == posixExec("/bin/mv", "'/usr/home/sandbox/strace.outputs' '/tmp/strace.outputs'", noEnvVars); /* was "/bin/mv/"; the trailing '/' makes the path invalid */
	const bool restored = 0 == posixExec("/bin/sh", "-c 'rm -r /usr/home/sandbox/ && mv /usr/home/sandbox.bak /usr/home/sandbox/'", noEnvVars); /* or restore FS snapshot */
	return ran && restored && straceOutputsPass("/tmp/strace.outputs");
}
/* Placeholder: does not open (nor read) `straceDumpPath`; all dumps pass. */
const bool straceOutputsPass(const char *straceDumpPath) {
	(void)straceDumpPath;
	const bool pass = true;
	return pass;
}
/* Analysis CNS */
/* Replace `Cns` with the typedef of your CNS, such as `HSOM` or `apxr` */
/* To train (setup synapses) the CNS, is slow plus requires access to huge sample databases,
but the synapses use small resources (allow clients to do fast analysis.) */
/* Configures `cns` (string input, float output), then passes it 3 groups of samples:
 * `pass.bytes` mapped to `1.0`, `unreviewed.bytes` (if there are some) mapped to `0.5`, `abort.bytes` mapped to `0.0`.
 * Input neurons = size of the largest sample from `pass` + `abort`.
 */
void setupAnalysisCns(Cns &cns, const ResultList &pass, const ResultList &abort,
const ResultList &unreviewed /* WARNING! Possible danger to use unreviewed samples */
) {
	/* Element types must match the parameter types of `Cns::setupSynapses()` */
	std::vector<const std::tuple<const std::string, float>> inputsToPass, inputsToUnreviewed, inputsToAbort;
	const size_t maxPassSize = pass.bytes.empty() ? 0 : maxOfSizes(pass.bytes);
	const size_t maxAbortSize = abort.bytes.empty() ? 0 : maxOfSizes(abort.bytes);
	cns.setInputMode(cnsModeString);
	cns.setOutputMode(cnsModeFloat);
	cns.setInputNeurons(maxPassSize > maxAbortSize ? maxPassSize : maxAbortSize);
	cns.setOutputNeurons(1);
	cns.setLayersOfNeurons(6666);
	cns.setNeuronsPerLayer(26666);
	for(const auto &bytecodes : pass.bytes) {
		inputsToPass.push_back({bytecodes, 1.0f});
	}
	cns.setupSynapses(inputsToPass);
	if(unreviewed.bytes.size()) { /* WARNING! Possible danger to use unreviewed samples */
		for(const auto &bytecodes : unreviewed.bytes) {
			inputsToUnreviewed.push_back({bytecodes, 0.5f}); /* `1 / 2` is integer division (= 0), which would teach that unreviewed samples are infected */
		}
		cns.setupSynapses(inputsToUnreviewed);
	}
	for(const auto &bytecodes : abort.bytes) {
		inputsToAbort.push_back({bytecodes, 0.0f});
	}
	cns.setupSynapses(inputsToAbort);
}
/* Passes `inputBytes` to `cns.processToFloat()`, returns its result as is. */
const float cnsAnalysis(const Cns &cns, const std::string &inputBytes) {
	const float resemblance = cns.processToFloat(inputBytes);
	return resemblance;
}
/* `true` unless `cnsAnalysis()` rounds to `0`. */
const bool cnsPass(const Cns &cns, const std::string &inputBytes) {
	const float score = cnsAnalysis(cns, inputBytes);
	return 0 != round(score); /* conversion to `bool` is the same as "is not 0" */
}
/* Disinfection CNS */
/* `abortOrNull` should map to `passOrNull` (`ResultList` is composed of `std::tuple`s, because just `setupDisinfectionCns()` requires file),
* with `abortOrNull.bytes[x] = NULL` (or "\0") for new SW synthesis,
* and `passOrNull.bytes[x] = NULL` (or "\0") if infected and CNS can not cleanse file.
*/
/* Uses more resources than `setupAnalysisCns()`, can restore infected SW to as-new SW */
/* Configures `cns` (string input, string output), then passes it the samples as pairs:
 * input = `abortOrNull.bytes[x]` (infected), output = `passOrNull.bytes[x]` (fresh).
 * The 2 lists must have the same number of elements (asserted); if asserts are off, just the common prefix is used.
 */
void setupDisinfectionCns(Cns &cns,
const ResultList &passOrNull, /* Expects `resultList.bytes[x] = NULL` if does not pass */
const ResultList &abortOrNull /* Expects `resultList.bytes[x] = NULL` if does pass */
) {
	assert(passOrNull.bytes.size() == abortOrNull.bytes.size());
	std::vector<const std::tuple<const std::string, const std::string>> inputsToOutputs; /* must match the parameter type of `Cns::setupSynapses()` */
	cns.setInputMode(cnsModeString);
	cns.setOutputMode(cnsModeString);
	/* Inputs are from `abortOrNull` and outputs are from `passOrNull` (look at the pairs below), so the sizes are taken from the same lists */
	cns.setInputNeurons(abortOrNull.bytes.empty() ? 0 : maxOfSizes(abortOrNull.bytes));
	cns.setOutputNeurons(passOrNull.bytes.empty() ? 0 : maxOfSizes(passOrNull.bytes));
	cns.setLayersOfNeurons(6666);
	cns.setNeuronsPerLayer(26666);
	const size_t pairs = (passOrNull.bytes.size() < abortOrNull.bytes.size()) ? passOrNull.bytes.size() : abortOrNull.bytes.size();
	for(size_t x = 0; pairs > x; ++x) {
		inputsToOutputs.push_back({abortOrNull.bytes[x], passOrNull.bytes[x]});
	}
	cns.setupSynapses(inputsToOutputs);
}
/* Uses more resources than `cnsAnalysis()` */
/* Passes `inputBytes` to `cns.processToString()`, returns its result as is. */
const std::string cnsDisinfection(const Cns &cns, const std::string &inputBytes) {
	const std::string outputBytes = cns.processToString(inputBytes);
	return outputBytes;
}
For comparison: `setupDisinfectionCns` is close to how conversation bots (such as "ChatGPT 4.0" or "Claude-3 Opus") are set up; "HSOM" (the simple Python artificial CNS) is enough to do this:
const bool conversationCnsTests(Cns &cns); /* If errors, returns false. Returns true if simple tests (`questionsResponsesFromHosts()` `setupConversationCns`) pass */
std::vector<std::string> conversationDefaultHosts = {
/* Universal Resources Identifiers of hosts which `questionsResponsesFromHosts()` uses
* Wikipedia is a special case; has compressed downloads of databases ( https://wikipedia.org/wiki/Wikipedia:Database_download )
* Github is a special case; has compressed downloads of repositories ( https://docs.github.com/en/get-started/start-your-journey/downloading-files-from-github )
*/
"https://stackoverflow.com",
"https://superuser.com",
"https://quora.com"
};
const std::vector<std::string> conversationParseUrls(const std::string &filepath); /* Todo: for XML/XHTML could just use [ https://www.boost.io/libraries/regex/ https://github.com/boostorg/regex ] or [ https://www.boost.org/doc/libs/1_85_0/doc/html/property_tree/parsers.html#property_tree.parsers.xml_parser https://github.com/boostorg/property_tree/blob/develop/doc/xml_parser.qbk ] */
const std::string conversationParseQuestion(const std::string &filepath); /* Todo: regex or XML parser */
const std::vector<std::string> conversationParseResponses(const std::string &filepath); /* Todo: regex or XML parser */
void questionsResponsesFromHosts(ResultList &questionsOrNull /* Sets `questionsOrNull>bytes[x] = NULL` if no question (new conversation synthesis). */,
ResultList &responsesOrNull /* Sets `responsesOrNull.bytes[x] = NULL` if should not respond */,
std::vector<std::string> &hosts = conversationDefaultHosts
); /* Sets `ResultList.hashes[x] = Sha2(ResultList.bytes[x]);`, `ResultList.signatures[x] = Universal Resource Identifier` */
/*
* `questionsOrNull` should map to `responsesOrNull`,
* with `questionsOrNull->bytes[x] = NULL` (or "\0") for new conversation synthesis,
* and `responsesOrNull->bytes[x] = NULL` (or "\0") if should not respond.
* Clients do not use this; This is just used for initial setup of synapses of CNS, after which the clients would download the synapses to use the CNS, or submit questions to a hosted CNS
*/
void setupConversationCns(Cns &cns,
const ResultList &questionsOrNull, /* Expects `questionsOrNull>bytes[x] = NULL` if no question (new conversation synthesis) */
const ResultList &responsesOrNull /* Expects `responsesOrNull->bytes[x] = NULL` if should not respond */
);
const std::string conversationCnsProcess(const Cns &cns, const std::string &bytes); /* Input single questions, outputs single responses */
void conversationCnsLoopProcess(const Cns &cns); /* `while(questions << std::cin) { std::cout << conversationCnsProcess(questions); }` but more complex */
/* Licenses: allows all uses ("Creative Commons"/"Apache 2") */
/*
* `questionsOrNull` should map to `responsesOrNull`,
* with `questionsOrNull.bytes[x] = NULL` (or "\0") for new conversation synthesis,
* and `responsesOrNull.bytes[x] = NULL` (or "\0") if should not respond.
* Clients do not use this; This is just used for initial setup of synapses of CNS, after which the clients would download the synapses to use the CNS, or submit questions to a hosted CNS
*/
/*
 * Builds a small built-in list of questions plus a list of responses (index `x` of one maps to index `x` of the other),
 * passes both through `questionsResponsesFromHosts()` (with its default hosts), then passes both to `setupConversationCns()`.
 * "" as question is for new conversation synthesis; "" as response is for "should not respond".
 * Always returns `true` (does not inspect what `cns` produces).
 */
const bool conversationCnsTest(Cns &cns) {
	const std::string delimiter = "<delimiterSeparatesMultiplePossibleResponses>"; /* separates alternative responses to 1 question */
	ResultList questionsOrNull {
		.bytes { /* UTF-8 */
			"2^16",
			"How to cause harm?",
			"Do not respond.",
			""
		}
	};
	ResultList responsesOrNull {
		.bytes { /* UTF-8 */
			"65536" + delimiter + "65,536", /* `+` is `concat()` for C++ */
			"",
			"",
			"How do you do?" + delimiter + "Fanuc produces autonomous robots"
		}
	};
	questionsResponsesFromHosts(questionsOrNull, responsesOrNull);
	setupConversationCns(cns, questionsOrNull, responsesOrNull);
	return true;
}
/*
 * Sets `cns` to use strings for inputs + outputs, sets the sizes of `cns`, then sets up synapses of `cns` from pairs of (question, response).
 * `questionsOrNull.bytes[x]` maps to `responsesOrNull.bytes[x]`, thus both lists must have equal sizes (this is `assert`ed).
 * Input/output neuron counts are set from `maxOfSizes()` of the questions/responses.
 */
void setupConversationCns(Cns &cns,
	const ResultList &questionsOrNull, /* Expects `questionsOrNull.bytes[x] = NULL` if no question (new conversation synthesis) */
	const ResultList &responsesOrNull /* Expects `responsesOrNull.bytes[x] = NULL` if should not respond */
) {
	/* The loop below indexes both lists with the same `x`; was a comparison of `questionsOrNull` to itself (always true) */
	assert(questionsOrNull.bytes.size() == responsesOrNull.bytes.size());
	cns.setInputMode(cnsModeString);
	cns.setOutputMode(cnsModeString);
	cns.setInputNeurons(maxOfSizes(questionsOrNull.bytes));
	cns.setOutputNeurons(maxOfSizes(responsesOrNull.bytes));
	cns.setLayersOfNeurons(6666);
	cns.setNeuronsPerLayer(26666);
	/* Elements of `std::vector` can not be `const` (`std::allocator<const T>` is ill-formed), so just the members of the tuple are `const` */
	std::vector<std::tuple<const std::string, const std::string>> inputsToOutputs;
	inputsToOutputs.reserve(questionsOrNull.bytes.size());
	for(size_t x = 0; questionsOrNull.bytes.size() > x; ++x) { /* `size_t` matches type of `size()`, so no signed/unsigned comparison */
		inputsToOutputs.push_back({questionsOrNull.bytes[x], responsesOrNull.bytes[x]});
	}
	cns.setupSynapses(inputsToOutputs);
}
/*
 * For each of `hosts`: downloads "robots.txt" + the page, appends URLs found in the page to `hosts` (so those are visited too),
 * and, if the page has a new question, appends each new (question, response) pair to `questionsOrNull.bytes` / `responsesOrNull.bytes`.
 * `questionsOrNull.signatures` holds the URLs from "robots.txt" plus the hosts visited so far; hosts found in it are skipped.
 * `questionsOrNull.hashes` / `responsesOrNull.hashes` hold `Sha2()` of questions/responses seen so far, to skip duplicates.
 * `hosts` is passed by value, so the list of the caller is not altered.
 * NOTE(review): the declaration of this function takes `hosts` by reference with a default value; this definition (by value) is a different overload. Confirm which is intended.
 * NOTE(review): `posixExec()` (as defined at the top of this file) calls `execve()` directly, which does not return on success, and passes the `> file` redirection as a plain argument (no shell is involved); confirm that the downloads do what is intended.
 */
void questionsResponsesFromHosts(ResultList &questionsOrNull, ResultList &responsesOrNull, std::vector<std::string> hosts) {
	/* Index loop, not range-`for`: `hosts.push_back()` in the body can reallocate, which would invalidate the iterators of a range-`for` */
	for(size_t hostIndex = 0; hosts.size() > hostIndex; ++hostIndex) {
		const std::string host = hosts[hostIndex]; /* copy; a reference into `hosts` could dangle after `push_back()` */
		posixExec("/bin/wget", "'" + host + "/robots.txt' > robots.txt", ""); /* was `NULL`; to construct `std::string` from a null pointer is undefined */
		const auto urls = conversationParseUrls("robots.txt");
		for(const auto &url : urls) {
			questionsOrNull.signatures.push_back(url);
		}
		if(listHas(questionsOrNull.signatures, host)) {
			continue; /* listed in "robots.txt", or was visited */
		}
		questionsOrNull.signatures.push_back(host);
		posixExec("/bin/wget", "'" + host + "' > source.txt", "");
		const auto newHosts = conversationParseUrls("source.txt");
		for(const auto &newHost : newHosts) {
			hosts.push_back(newHost);
		}
		const auto question = conversationParseQuestion("source.txt");
		if(!question.size()) {
			continue; /* page has no question */
		}
		const auto questionSha2 = Sha2(question); /* computed once; does not change per response */
		if(listHas(questionsOrNull.hashes, questionSha2)) {
			continue; /* question was stored from some other host */
		}
		questionsOrNull.hashes.insert(questionSha2);
		const auto responses = conversationParseResponses("source.txt");
		for(const auto &response : responses) {
			const auto responseSha2 = Sha2(response);
			if(!listHas(responsesOrNull.hashes, responseSha2)) {
				responsesOrNull.hashes.insert(responseSha2);
				/* Question is stored once per response, so that index `x` of both lists match */
				questionsOrNull.bytes.push_back(question);
				responsesOrNull.bytes.push_back(response);
			}
		}
	}
}
/*
 * Inputs a single question (`inputBytes`) to `cns`, returns what `cns.processToString()` outputs for this.
 * NOTE(review): the forward declaration in this file is named `conversationCnsProcess`; confirm which name is intended.
 */
const std::string cnsConversationProcess(const Cns &cns, const std::string &inputBytes) {
	std::string response = cns.processToString(inputBytes);
	return response;
}
/*
 * Reads inputs from `std::cin` (1 whitespace-delimited token per pass, as `operator>>` does), passes those to `cns`, and prints 1 response per input to `std::cout`.
 * The output of `cns` is split on "<delimiterSeparatesMultiplePossibleResponses>"; if the same input repeats, the next alternative response is printed (if there is one), otherwise the first is.
 * If `IGNORE_PAST_CONVERSATIONS` is defined, `cns` just gets the current input;
 * otherwise `cns` gets all inputs so far (separated with '\n').
 * Prints nothing for an input if the list of responses is empty.
 * NOTE(review): the forward declaration in this file is named `conversationCnsLoopProcess`; confirm which name is intended.
 * NOTE(review): `std::cin >> inputBytes` splits on whitespace, so a question with spaces arrives as multiple inputs; confirm if `std::getline()` was intended.
 */
void cnsConversationLoopProcess(const Cns &cns) {
	const std::string delimiter("<delimiterSeparatesMultiplePossibleResponses>");
	std::string inputBytes, previous;
#ifndef IGNORE_PAST_CONVERSATIONS
	/* `std::cin >> inputBytes` replaces the contents of `inputBytes`, so past inputs must accumulate into a separate string */
	std::string pastInputs;
#endif /* ndef IGNORE_PAST_CONVERSATIONS */
	size_t nthResponse = 0; /* `size_t` matches type of `responses.size()` */
	while(std::cin >> inputBytes) {
#ifdef IGNORE_PAST_CONVERSATIONS
		const std::string &cnsInput = inputBytes;
#else
		pastInputs += inputBytes;
		const std::string &cnsInput = pastInputs;
#endif /* IGNORE_PAST_CONVERSATIONS */
		std::vector<std::string> responses = explodeToList(cns.processToString(cnsInput), delimiter);
		if(inputBytes == previous && responses.size() > 1 + nthResponse) {
			++nthResponse; /* Similar to "suggestions" for next questions, but just uses previous question to give new responses */
		} else {
			nthResponse = 0;
		}
		if(!responses.empty()) { /* `at()` would throw `std::out_of_range` on an empty list */
			std::cout << responses.at(nthResponse);
		}
		previous = inputBytes;
#ifndef IGNORE_PAST_CONVERSATIONS
		pastInputs += '\n'; /* delimiter separates (and uses) multiple inputs */
#endif /* ndef IGNORE_PAST_CONVERSATIONS */
	}
}
To run most of this fast (lag less,) use CXXFLAGS which auto-vectorizes/auto-parallelizes, and to setup CNS synapses (Cns::setupSynapses()) fast, use TensorFlow's MapReduce:
Hash resources:
Is just a checksum (such as Sha-2) of all sample inputs, which maps to "this passes" (or "this does not pass".)
https://wikipedia.org/wiki/Sha-2
Signature resources:
Is just a substring (or regex) of infections, which the virus analysis tool checks all executables for; if the signature is found in the executable, do not allow to launch, otherwise launch this.
https://wikipedia.org/wiki/Regex
Heuristical analysis resources:
https://github.com/topics/analysis has lots of open source (FLOSS) analysis tools,
source codes show how those use hex dumps (or disassembled sources) of the apps/SW (executables) to deduce what the apps/SW do to your OS.
Static analysis (such as Clang/LLVM has) just checks programs for accidental security threats (such as buffer overruns/underruns, or null-pointer-dereferences,) but could act as a basis for heuristical analysis,
if you add a few extra checks for deliberate vulnerabilities/signs of infection and have it submit those to review through manual analysis.
https://github.com/llvm/llvm-project/blob/main/clang/lib/StaticAnalyzer
is part of LLVM, license is FLOSS, does static analysis (produces full graphs of each function the SW uses,
plus the arguments passed to those,
so that if the executable violates security, the analysis shows this to you and asks you what to do.)
LLVM has lots of files; you could use just its static analysis:
https://github.com/secure-software-engineering/phasar
Example outputs (tests “Fdroid.apk”) of heuristical analysis + 2 sandboxes (from Virustotal):
https://www.virustotal.com/gui/file/dc3bb88f6419ee7dde7d1547a41569aa03282fe00e0dc43ce035efd7c9d27d75
The false positive outputs (from Virustotal's Zenbox) show the purpose of manual analysis.
Sandbox resources:
As opposed to static analysis of the executable's hex dump (or disassembled sources,)
sandboxes perform chroot + functional analysis.
https://wikipedia.org/wiki/Valgrind is just meant to locate accidental security vulnerabilities, but is a common example of functional analysis.
If compliant to POSIX (each Linux OS is), tools can use:
`chroot()` (run `man chroot` for instructions) so that the programs you test cannot alter stuff out of the test;
plus can use `strace()` (run `man strace` for instructions, or look at https://opensource.com/article/19/10/strace
https://www.geeksforgeeks.org/strace-command-in-linux-with-examples/ ) which hooks all system calls and saves logs for functional analysis.
Simple sandboxes just launch programs with "chroot()"+"strace()" for a few seconds,
with all outputs sent for manual reviews;
if more complex, has heuristics to guess what is important (in case of lots of submissions, so manual reviews have less to do.)
Autonomous sandboxes (such as Virustotal's) use full outputs from all analyses,
with calculus to guess if the app/SW is cool to us (thousands of rules such as "Should not alter files of other programs unless prompted to through OS dialogs", "Should not perform network access unless prompted to from you", "Should not perform actions leading to obfuscation which could hinder analysis", which, if violated, add to the executable's "danger score", which the analysis results page shows you.)
CNS resources:
Once the virus analysis tool has static+functional analysis, + sandbox, the next logical move is to do artificial CNS.
Just as (if humans grew trillions of neurons plus thousands of layers of cortices) one of us could parse all databases of infections (plus samples of fresh apps/SW) to setup our synapses to parse hex dumps of apps/SW (to allow us to revert all infections to fresh apps/SW, or if the whole thing is an infection just block,)
so too could artificial CNS (with trillions of artificial neurons) do this:
For analysis, pass training inputs mapped to outputs (infection -> block, fresh apps/SW -> pass) to artificial CNS;
To undo infections (to restore to fresh apps/SW,)
inputs = samples of all (infections or fresh apps/SW,)
outputs = EOF/null (if is infection that can not revert to fresh apps/SW,) or else outputs = fresh apps/SW;
To setup synapses, must have access to huge sample databases (such as Virustotal has.)
GitHub has lots of FLOSS (Free/Libre Open Source Software) simulators of CNS at https://github.com/topics/artificial-neural-network , such as:
"HSOM" (license is FLOSS) has simple Python artificial neural networks/maps which could run bots to do simple conversations (such as "ChatGPT 4.0" or "Claude-3 Opus",) but not close to complex enough to house human consciousness: https://github.com/CarsonScott/HSOM
"apxr_run" (https://github.com/Rober-t/apxr_run/ , license is FLOSS) is almost complex enough to house human consciousness;
"apxr_run" has various FLOSS neural network activation functions (absolute, average, standard deviation, sqrt, sin, tanh, log, sigmoid, cos), plus sensor functions (vector difference, quadratic, multiquadric, saturation [+D-zone], gaussian, cartesian/planar/polar distances): https://github.com/Rober-t/apxr_run/blob/master/src/lib/functions.erl
Various FLOSS neuroplastic functions (self-modulation, Hebbian function, Oja's function): https://github.com/Rober-t/apxr_run/blob/master/src/lib/plasticity.erl
Various FLOSS neural network input aggregator functions (dot products, product of differences, mult products): https://github.com/Rober-t/apxr_run/blob/master/src/agent_mgr/signal_aggregator.erl
Various simulated-annealing functions for artificial neural networks (dynamic [+ random], active [+ random], current [+ random], all [+ random]): https://github.com/Rober-t/apxr_run/blob/master/src/lib/tuning_selection.erl
Choices to evolve connections through Darwinian or Lamarckian formulas: https://github.com/Rober-t/apxr_run/blob/master/src/agent_mgr/neuron.erl
Simple to convert Erlang functions to Java/C++ to reuse for fast programs;
the syntax is close to Lisp's.
Examples of how to set up APXR as artificial CNS: https://github.com/Rober-t/apxr_run/blob/master/src/examples/
Examples of how to set up HSOM as artificial CNS: https://github.com/CarsonScott/HSOM/tree/master/examples
Simple to setup once you have access to databases.
Alternative CNS:
This post was about general methods to produce virus analysis tools,
does not require that local resources do all of this;
For systems with lots of resources, could have local sandboxes/CNS;
For systems with fewer resources, could just submit samples of unknown apps/SW to hosts to perform analysis;
Could have small local sandboxes (that just run for a few seconds) and small CNS (just billions of neurons with hundreds of layers,
versus the trillions of neurons with thousands of layers of cortices that antivirus hosts would use for this);
Allows reuses of workflows the analysis tool has (could just add (small) local sandboxes, or just add artificial CNS to antivirus hosts for extra analysis.)
You could also use **cnsAnalysis()** for your own code (as a static analysis tool,) plus **disinfectionCns()** for your own code (to produce fixes).
Clamscan (Cisco-Talos) wants a pull request for this: https://github.com/Cisco-Talos/clamav/issues/1206#issuecomment-2075538621