1. #include "Setting.h"
    2. #include "Random.h"
    3. #include "Reader.h"
    4. #include "Corrupt.h"
    5. #include "Test.h"
    6. #include <cstdlib>
    7. #include <pthread.h>
// Exported C API consumed from the Python side via ctypes/cffi.
// NOTE(review): __declspec(dllexport) is Windows-specific; combined with
// <pthread.h> this presumably targets a MinGW/pthreads-win32 build — confirm,
// otherwise these should be guarded by an #ifdef.

// --- Path configuration for the dataset files ---
extern "C" __declspec(dllexport)
void setInPath(char *path);
extern "C" __declspec(dllexport)
void setTrainPath(char *path);
extern "C" __declspec(dllexport)
void setValidPath(char *path);
extern "C" __declspec(dllexport)
void setTestPath(char *path);
extern "C" __declspec(dllexport)
void setEntPath(char *path);
extern "C" __declspec(dllexport)
void setRelPath(char *path);
extern "C" __declspec(dllexport)
void setOutPath(char *path);

// --- Runtime configuration ---
extern "C" __declspec(dllexport)
void setWorkThreads(INT threads);
// Enable/disable Bernoulli negative sampling (bernFlag).
extern "C" __declspec(dllexport)
void setBern(INT con);

// --- Dataset-size accessors ---
extern "C" __declspec(dllexport)
INT getWorkThreads();
extern "C" __declspec(dllexport)
INT getEntityTotal();
extern "C" __declspec(dllexport)
INT getRelationTotal();
extern "C" __declspec(dllexport)
INT getTripleTotal();
extern "C" __declspec(dllexport)
INT getTrainTotal();
extern "C" __declspec(dllexport)
INT getTestTotal();
extern "C" __declspec(dllexport)
INT getValidTotal();

// Reset the per-thread random number generators.
extern "C" __declspec(dllexport)
void randReset();
// Load the training triples from disk into memory.
extern "C" __declspec(dllexport)
void importTrainFiles();
// Per-thread argument record passed to getBatch() through pthread_create.
// Each worker receives its own Parameter; the batch_* pointers are shared
// output buffers, and `id` selects the worker's slice of them.
struct Parameter {
	INT id;           // worker thread index in [0, workThreads)
	INT *batch_h;     // output: head entity ids
	INT *batch_t;     // output: tail entity ids
	INT *batch_r;     // output: relation ids
	REAL *batch_y;    // output: labels (+1 positive, -1 negative)
	INT batchSize;    // number of positive triples per batch
	INT negRate;      // negative entity samples per positive triple
	INT negRelRate;   // negative relation samples per positive triple
	bool p;           // forwarded to corrupt_rel (semantics defined there)
	bool val_loss;    // false = sample training negatives; true = copy validation triples
	INT mode;         // 0 = corrupt head or tail; -1 = corrupt head; otherwise corrupt tail
	bool filter_flag; // NOTE(review): stored but never read in getBatch — confirm intent
};
// Worker-thread entry point: fills this thread's slice of the shared batch
// buffers with positive triples and their negative samples.
//
// Buffer layout: slots [0, batchSize) hold positive triples; the k-th
// negative for positive slot `batch` lives at index batch + k * batchSize
// (see `last = batchSize; ...; last += batchSize` below). The caller must
// therefore allocate batchSize * (1 + negRate + negRelRate) entries per
// array — TODO confirm against the allocating caller.
void* getBatch(void* con) {
	// Reinterpret the opaque pthread argument as our Parameter record.
	Parameter *para = (Parameter *)(con);
	// Copy the fields into locals for readability.
	INT id = para -> id;
	INT *batch_h = para -> batch_h;
	INT *batch_t = para -> batch_t;
	INT *batch_r = para -> batch_r;
	REAL *batch_y = para -> batch_y;
	INT batchSize = para -> batchSize;
	INT negRate = para -> negRate;
	INT negRelRate = para -> negRelRate;
	bool p = para -> p;
	// val_loss == false means "training": sample negatives below.
	bool val_loss = para -> val_loss;
	INT mode = para -> mode;
	// NOTE(review): filter_flag is read here but never used in this function.
	bool filter_flag = para -> filter_flag;
	// Partition the batch across workThreads workers: this thread handles
	// positive slots [lef, rig).
	INT lef, rig;
	if (batchSize % workThreads == 0) {
		lef = id * (batchSize / workThreads);
		rig = (id + 1) * (batchSize / workThreads);
	} else {
		// Uneven split: round the chunk size up and clamp the last chunk.
		lef = id * (batchSize / workThreads + 1);
		rig = (id + 1) * (batchSize / workThreads + 1);
		if (rig > batchSize) rig = batchSize;
	}
	// Threshold (out of 1000) deciding head- vs tail-corruption;
	// 500 means a 50/50 split unless bernFlag overrides it per relation.
	REAL prob = 500;
	// Training mode: emit positives plus sampled negatives.
	if (val_loss == false) {
		for (INT batch = lef; batch < rig; batch++) {
			// rand_max uses a per-thread seed so workers draw independent
			// random streams (avoids identical time-based seeds).
			INT i = rand_max(id, trainTotal);
			// i is a uniform random index into the training triple list.
			batch_h[batch] = trainList[i].h;
			batch_t[batch] = trainList[i].t;
			batch_r[batch] = trainList[i].r;
			batch_y[batch] = 1;
			// `last` walks through the negative-sample groups, one group
			// of batchSize slots per negative sample.
			INT last = batchSize;
			// Draw negRate corrupted-entity negatives.
			for (INT times = 0; times < negRate; times ++) {
				// mode == 0: corrupt head or tail, chosen randomly.
				if (mode == 0){
					// Bernoulli trick: bias the head/tail choice by the
					// relation's average degree statistics.
					if (bernFlag)
						prob = 1000 * right_mean[trainList[i].r] / (right_mean[trainList[i].r] + left_mean[trainList[i].r]);
					if (randd(id) % 1000 < prob) {
						// Keep the head, corrupt the tail.
						batch_h[batch + last] = trainList[i].h;
						batch_t[batch + last] = corrupt_head(id, trainList[i].h, trainList[i].r);
						batch_r[batch + last] = trainList[i].r;
					} else {
						// Keep the tail, corrupt the head.
						batch_h[batch + last] = corrupt_tail(id, trainList[i].t, trainList[i].r);
						batch_t[batch + last] = trainList[i].t;
						batch_r[batch + last] = trainList[i].r;
					}
					batch_y[batch + last] = -1;
					last += batchSize;
				}
				// mode != 0: corruption side is fixed by the sign of mode.
				else {
					// mode == -1: corrupt the head only.
					if(mode == -1){
						batch_h[batch + last] = corrupt_tail(id, trainList[i].t, trainList[i].r);
						batch_t[batch + last] = trainList[i].t;
						batch_r[batch + last] = trainList[i].r;
					}
					// any other non-zero mode: corrupt the tail only.
					else {
						batch_h[batch + last] = trainList[i].h;
						batch_t[batch + last] = corrupt_head(id, trainList[i].h, trainList[i].r);
						batch_r[batch + last] = trainList[i].r;
					}
					batch_y[batch + last] = -1;
					last += batchSize;
				}
			}
			// Draw negRelRate corrupted-relation negatives.
			for (INT times = 0; times < negRelRate; times++) {
				batch_h[batch + last] = trainList[i].h;
				batch_t[batch + last] = trainList[i].t;
				batch_r[batch + last] = corrupt_rel(id, trainList[i].h, trainList[i].t, trainList[i].r, p);
				batch_y[batch + last] = -1;
				last += batchSize;
			}
		}
	}
	// Validation mode: copy this thread's slice of the validation triples
	// verbatim, all labeled positive.
	else
	{
		for (INT batch = lef; batch < rig; batch++)
		{
			batch_h[batch] = validList[batch].h;
			batch_t[batch] = validList[batch].t;
			batch_r[batch] = validList[batch].r;
			batch_y[batch] = 1;
		}
	}
	pthread_exit(NULL);
	// Unreachable after pthread_exit; kept to satisfy the non-void signature.
	return ((void*)0);
}
    156. extern "C" __declspec(dllexport)
    157. void sampling(
    158. INT *batch_h,
    159. INT *batch_t,
    160. INT *batch_r,
    161. REAL *batch_y,
    162. INT batchSize,
    163. INT negRate = 1,
    164. INT negRelRate = 0,
    165. // mode=0代表普通采样,mode=1代表尾部采样,mode=-1代表头部采样
    166. INT mode = 0,
    167. bool filter_flag = true,
    168. bool p = false,
    169. // val_loss应该代表的是是否为训练模式
    170. bool val_loss = false
    171. ) {
    172. // 根据线程数量,向内存分配指定的大小
    173. pthread_t *pt = (pthread_t *)malloc(workThreads * sizeof(pthread_t));
    174. // 根据线程数量,以及Parameter结构体的大小,向内存分配指定的大小
    175. Parameter *para = (Parameter *)malloc(workThreads * sizeof(Parameter));
    176. //初始化para结构体
    177. for (INT threads = 0; threads < workThreads; threads++) {
    178. para[threads].id = threads;
    179. para[threads].batch_h = batch_h;
    180. para[threads].batch_t = batch_t;
    181. para[threads].batch_r = batch_r;
    182. para[threads].batch_y = batch_y;
    183. para[threads].batchSize = batchSize;
    184. para[threads].negRate = negRate;
    185. para[threads].negRelRate = negRelRate;
    186. para[threads].p = p;
    187. para[threads].val_loss = val_loss;
    188. para[threads].mode = mode;
    189. para[threads].filter_flag = filter_flag;
    190. /*
    191. 创建线程
    192. int pthread_create(
    193. pthread_t *restrict tidp, //新创建的线程ID指向的内存单元。
    194. const pthread_attr_t *restrict attr, //线程属性,默认为NULL
    195. void *(*start_rtn)(void *), //新创建的线程从start_rtn函数的地址开始运行
    196. void *restrict arg //默认为NULL。若上述函数需要参数,将参数放入结构中并将地址作为arg传入。
    197. );
    198. */
    199. pthread_create(&pt[threads], NULL, getBatch, (void*)(para+threads));
    200. }
    201. // 收工
    202. for (INT threads = 0; threads < workThreads; threads++)
    203. pthread_join(pt[threads], NULL);
    204. free(pt);
    205. free(para);
    206. }
    207. int main() {
    208. importTrainFiles();
    209. return 0;
    210. }