【数据结构】3.跳表和散列

2023-10-03 13:02 由 imXuan 发表于 #其他

1.顺序链表字典

1.1字典抽象父类

#pragma once
using namespace std;

template<class K, class E>
class dictionary
{
public:
    virtual ~dictionary() {}
    // 返回字典是否为空
    virtual bool empty() const = 0;
    // 返回有多少键值对
    virtual int size() const = 0;
    // 根据K返回E
    virtual pair<const K, E>* find(const K&) const = 0;
    // 根据K删除键值对
    virtual void erase(const K&) = 0;
    // 插入一个键值对
    virtual void insert(const pair<const K, E>&) = 0;
};

1.2节点结构体

#pragma once
using namespace std;
template <class K, class E>
struct pairNode
{
    typedef pair<const K, E> pairType;
    pairType element;
    pairNode<K, E>* next;

    pairNode(const pairType& thePair) :element(thePair){}
    pairNode(const pairType& thePair, pairNode<K, E>* theNext):element(thePair)
    {
        next = theNext;
    }
};

1.3顺序链表字典实现

#pragma once
#include <iostream>
#include "dictionary.h"
#include "pairNode.h"

using namespace std;

template<class K, class E>
class sortedChain : public dictionary<K, E>
{
protected:
    pairNode<K, E>* firstNode; // 第一个结点
    int dSize;                 // 字典的大小

public:
    sortedChain() { firstNode = NULL; dSize = 0; }
    ~sortedChain();

    bool empty() const { return dSize == 0; }
    int size() const { return dSize; }
    pair<const K, E>* find(const K&) const;
    void erase(const K&);
    void insert(const pair<const K, E>&);
    void output();
};

template<class K, class E>
sortedChain<K, E>::~sortedChain()
{
    while (firstNode != NULL)
    {
        pairNode<K, E>* nextNode = firstNode->next;
        delete firstNode;
        firstNode = nextNode;
    }
}

template<class K, class E>
pair<const K, E>* sortedChain<K, E>::find(const K& theKey) const
{// 返回匹配的指针, 不存在则返回NULL
    pairNode<K, E>* currentNode = firstNode;

    while (currentNode != NULL && currentNode->element.first != theKey)
        currentNode = currentNode->next;

    if (currentNode != NULL && currentNode->element.first == theKey)
        return &currentNode->element;

    return NULL;
}

template<class K, class E>
void sortedChain<K, E>::insert(const pair<const K, E>& thePair)
{// 在字典中插入一个键值对, 并且覆盖已经存在的键值对
    pairNode<K, E>* p = firstNode;
    pairNode<K, E>* tp = NULL;

    while (p != NULL && p->element.first < thePair.first)
    {// 终止后要插入的结点在tp后, p之前
        tp = p;
        p = p->next;
    }

    if (p != NULL && p->element.first == thePair.first)
    {// 如果有匹配的键值对, 替换旧值
        p->element.second = thePair.second;
        return;
    }

    // 如果没有匹配的键值对, 创建一个新的节点, 该节点的下一个结点为p
    pairNode<K, E>* newNode = new pairNode<K, E>(thePair, p);

    if (tp == NULL) firstNode = newNode;
    else tp->next = newNode;

    dSize++;
    return;
}

template<class K, class E>
void sortedChain<K, E>::erase(const K& theKey)
{// 如果存在, 删除关键字为theKey的键值对
    pairNode<K, E>* p = firstNode;
    pairNode<K, E>* tp = NULL;

    // 如果存在, tp为theKey前一个结点, p为theKey结点
    while (p != NULL && p->element.first < theKey)
    {
        tp = p;
        p = p->next;
    }

    if (p != NULL && p->element.first == theKey)
    {
        if (tp == NULL) firstNode = p->next;
        else tp->next = p->next;

        delete p;
        dSize--;
    }
}
template<class K, class E>
void sortedChain<K, E>::output()
{
    pairNode<K, E>* p = firstNode;
    while (p != NULL)
    {
        cout << p->element.first << " " << p->element.second << " ";
        p = p->next;
    }
    cout << endl;
}

2.跳表

跳表可以近似实现二分查找的效果

以下面长度为7的链表举例，该跳表通过3条链表进行存储。假设要查找元素80：

首先在第2级链表查找，因为80大于40，所以在第3个节点右侧查找
然后在第1级链表查找，因为80大于75，所以在第5个节点右侧查找
最后在第0级链表查找，找到元素80

注意这里红色和绿色框起来的部分，在计算机中以数组的形式存储，也就是headerNode结构体里包含自身元素的值（头节点没有值），以及一个指针数组，数组0号位置存储的是元素20的指针，1号位置存储的是元素24的指针，2号位置存储的是元素40的指针

每次插入元素的时候，可以通过概率计算它被保存在第几级链表，他保存在第1级链表的概率应该为1/2，第2级链表的概率为1/4，之后为1/8以此类推，假设他保存在第2级，则应该为第0,1,2级的对应位置都插入一个节点，具体代码实现如下

2.1跳表节点结构体

#pragma once

template <class K, class E>
struct skipNode
{
    typedef pair<const K, E> pairType;
    pairType element;
    skipNode<K, E>** next;   // 指针数组

    skipNode(const pairType& thePair, int size):element(thePair) 
    {
        next = new skipNode<K, E>*[size];
    }
};

2.2跳表实现

#pragma once
#include <iostream>
#include <string>
#include "dictionary.h"
#include "skipNode.h"

using namespace std;

template<class K, class E>
class skipList : public dictionary<K, E>
{
protected:
    float cutOff;                           // 用来确定层数
    int level() const;                      // 用随机数计算插入节点在第几级链表
    int levels;                             // 当前最大的非空链表
    int dSize;                              // 键值对个数
    int maxLevel;                           // 允许的最大链表层数
    K tailKey;                              // 最大关键字
    skipNode<K, E>* search(const K&) const; // 根据Key查找节点
    skipNode<K, E>* headerNode;             // 头节点指针（每一级的头节点)
    skipNode<K, E>* tailNode;               // 尾节点指针（每一级的尾节点)
    skipNode<K, E>** last;                  // last[i] 表示第 i 层的最后一个结点

public:
    skipList(K, int maxPairs = 10000, float prob = 0.5);
    ~skipList();

    bool empty() const { return dSize == 0; }
    int size() const { return dSize; }
    pair<const K, E>* find(const K&) const;
    void erase(const K&);
    void insert(const pair<const K, E>&);
    void output() const;

};

template<class K, class E>
skipList<K, E>::skipList(K largeKey, int maxPairs, float prob)
{// 构造函数, 关键字小于largeKey, 且个数最多为maxPairs, 概率因子(0, 1)
    cutOff = prob * RAND_MAX;
    maxLevel = (int)ceil(logf((float)maxPairs) / logf(1 / prob)) - 1;
    levels = 0;  // 初始化层数
    dSize = 0;
    tailKey = largeKey;

    // 创建头结点尾结点和数组last
    pair<K, E> tailPair;
    tailPair.first = tailKey;
    headerNode = new skipNode<K, E>(tailPair, maxLevel + 1);
    tailNode = new skipNode<K, E>(tailPair, 0);
    last = new skipNode<K, E> *[maxLevel + 1];

    // 链表为空时, 所有头节点都指向尾节点
    for (int i = 0; i <= maxLevel; i++)
        headerNode->next[i] = tailNode;
}

template<class K, class E>
skipList<K, E>::~skipList()
{// 删除所有节点以及last数组
    skipNode<K, E>* nextNode;

    // 根据第0级删除所有节点
    while (headerNode != tailNode)
    {
        nextNode = headerNode->next[0];
        delete headerNode;
        headerNode = nextNode;
    }
    delete tailNode;

    delete[] last;
}

template<class K, class E>
pair<const K, E>* skipList<K, E>::find(const K& theKey) const
{// 返回匹配的 键值对指针 或 NULL
    if (theKey >= tailKey)
        return NULL;

    // 位置 beforeNode 是关键字为 theKey 的节点之前最右边的位置
    skipNode<K, E>* beforeNode = headerNode;
    for (int i = levels; i >= 0; i--)                       // 自上而下: 从上级链表向下查找
        while (beforeNode->next[i]->element.first < theKey) // 自左而右: 第 i 级的指针链表一直向右移动
            beforeNode = beforeNode->next[i];

    if (beforeNode->next[0]->element.first == theKey)
        return &beforeNode->next[0]->element;

    return NULL;
}

template<class K, class E>
int skipList<K, E>::level() const
{// 假设prob是1/2, 这里就是它是第几级链表的概率, 级越高概率越低, 数据量规模足够大时每层都是下面一层的1/2
    int lev = 0;
    while (rand() <= cutOff)    // 每次小于它的概率都是1/2
        lev++;
    cout << "它位于第" << lev << "级链表" << endl;
    return (lev <= maxLevel) ? lev : maxLevel;
}

template<class K, class E>
skipNode<K, E>* skipList<K, E>::search(const K& theKey) const
{// 搜索关键字 theKey, 把每一级链表中要查看的最后一个节点存储在 last 中
 // 返回 theKey 对应的键值对（如果有的话，在insert中进行了判断）
    skipNode<K, E>* beforeNode = headerNode;
    for (int i = levels; i >= 0; i--)
    {
        while (beforeNode->next[i]->element.first < theKey)
            beforeNode = beforeNode->next[i];
        last[i] = beforeNode;  // 把每一层theKey的前一个结点都存储到last数组里
    }
    return beforeNode->next[0];
}

template<class K, class E>
void skipList<K, E>::insert(const pair<const K, E>& thePair)
{// 把 thePair 插入字典, 或覆盖已有关键字的键值对
    if (thePair.first >= tailKey)
    {
        cout << "关键字过大" << endl;
        return;
    }

    // 查看关键字是否已经存在
    skipNode<K, E>* theNode = search(thePair.first);
    if (theNode->element.first == thePair.first)
    {   // 已经存在则更新键值对的值
        theNode->element.second = thePair.second;
        return;
    }

    // 不存在则插入新的键值对
    int theLevel = level(); // 新节点存在于几级链表
    // 如果这个链表级别太高了, 让他只变为当前链表等级+1
    if (theLevel > levels)
    {
        theLevel = ++levels;
        last[theLevel] = headerNode;
    }

    skipNode<K, E>* newNode = new skipNode<K, E>(thePair, theLevel + 1);
    for (int i = 0; i <= theLevel; i++)
    {// 在每一级插入该节点
        newNode->next[i] = last[i]->next[i];
        last[i]->next[i] = newNode;
    }

    dSize++;
    return;
}

template<class K, class E>
void skipList<K, E>::erase(const K& theKey)
{// 删除关键字为theKey的数对
    if (theKey >= tailKey)
        return;

    // 判断是否有匹配的键值对
    skipNode<K, E>* theNode = search(theKey);
    if (theNode->element.first != theKey)
        return;

    // 从跳表删除键值对
    for (int i = 0; i <= levels && last[i]->next[i] == theNode; i++)
        last[i]->next[i] = theNode->next[i];
 
    // 更新调表级数
    while (levels > 0 && headerNode->next[levels] == tailNode)
        levels--;

    delete theNode;
    dSize--;
}

template<class K, class E>
void skipList<K, E>::output() const 
{
    skipNode<K, E>* p;
    for (int i = levels; i >= 0; i--)
    {
        p = headerNode;
        while (p->next[i]->element.first < tailKey)
        {
            p = p->next[i];
            cout << p->element.second << ", ";
        }
        cout << endl;
    }
}

3.散列

3.1散列表描述

字典的另一种表示方法是散列（hashing），它用一个散列函数（哈希函数）把字典的键值对映射到一个散列表中，键值对为p，关键字为k，则p = f(k)

3.2散列函数和散列表

桶和起始桶：散列表的每一个位置叫一个桶（bucket），f(k) 是起始桶（home bucket），桶的数量等于散列表的长度。散列函数可以将多个关键字映射到同一个桶，所以桶可以容纳多个键值对。散列函数中最常用的是除法散列函数，其中 k 是关键字，D 是散列表的长度 f(k) = k % D
冲突和溢出：假设两个关键字对应的起始桶相同，就是发生了冲突（collision），但一个桶可以存放多个键值对，所以只要桶足够大，就没什么问题。但如果没有空间存储一个新键值对时，就发生了溢出（overflow）
找一个好的散列函数：如果映射到散列表中任何一个桶的关键字数量大致相等，发生冲突和溢出的平均数就小，均匀散列函数（uniform hash function）就是这样的函数

size_t operator()(const string theKey)
{
    unsigned long hashValue = 0;
    int length = (int) theKey.length();
    for(int i=0; i<length; i++)
        hashValue = 5 * hashValue + theKey.at(i);
    return size_t(hashValue);
}

3.3 线性探查

我们有一个函数 p=f(k) 是p = k % 11 ，首先插入 80 % 11 = 3，然后再想插入 58 % 11 = 3 时发现已经满了，最简单的办法是找到下一个可以存放该数据的桶，这种方法叫线性探查（Linear Probing）

我们把58存储在4号桶；然后插入24，2号桶为空所以插入2号桶；接下来插入35，用线性探查插入到5号桶；最后插入98时由于10号桶已有数据，所以插入到0号桶。这种情况下散列可以看成是循环链表

由此可以确定搜索方法，首先计算起始搜索桶f(k)

如果关键字k的桶已找到, 则找到了键值对
到达一个空桶, 说明未找到
回到起始桶f(k), 说明未找到

3.4 散列的数组实现

实现一个hash类，将key转换成非负的整数

#pragma once
#include <iostream>
#include <string>

using namespace std;

// 把 Key 转换成非负整数
template<class K> class myhash;
template<>
class myhash<string>
{
public:
    size_t operator()(const string theKey) const
    {
        unsigned long hashValue = 0;
        int length = (int)theKey.length();
        for (int i = 0; i < length; i++)
            hashValue = 5 * hashValue + theKey.at(i);

        return size_t(hashValue);
    }
};

template<>
class myhash<int>
{
public:
    size_t operator()(const int theKey) const
    {
        return size_t(theKey);
    }
};

template<>
class myhash<char>
{
public:
    size_t operator()(const char theKey) const
    {
        int ret = (int)theKey;
        return size_t(ret);
    }
};

数组散列的实现

#pragma once
#include <iostream>
#include "hash.h"

using namespace std;

template<class K, class E>
class hashTable
{
protected:
    int search(const K&) const;
    pair<const K, E>** table;  // 哈希表
    myhash<K> hash;            // 将K转换为非负整数
    int dSize;                 // 字典的大小
    int divisor;               // 节点个数

public:
    hashTable(int theDivisor = 11);
    ~hashTable() { delete[] table; }

    bool empty() const { return dSize == 0; }
    int size() const { return dSize; }
    pair<const K, E>* find(const K&) const;
    void insert(const pair<const K, E>&);
    void output() const;


};

template<class K, class E>
hashTable<K, E>::hashTable(int theDivisor)
{
    divisor = theDivisor;
    dSize = 0;

    // 分配数组的空间
    table = new pair<const K, E>*[divisor];
    for (int i = 0; i < divisor; i++)
        table[i] = NULL;
}

template<class K, class E>
int hashTable<K, E>::search(const K& theKey) const
{// 查找关键字为 theKey 的键值对, 如果存在匹配的键值对则返回他的位置, 如果不存在则返回可以插入的位置

    int i = (int)hash(theKey) % divisor;  // 起始桶
    int j = i;    // j 从起始桶开始搜索
    do
    {
        if (table[j] == NULL || table[j]->first == theKey)
            return j;
        j = (j + 1) % divisor;  // j 移动到下一个桶
    } while (j != i);           // 回到起始桶位置

    return j;  // 表满
}

template<class K, class E>
pair<const K, E>* hashTable<K, E>::find(const K& theKey) const
{// 返回匹配的指针或NULL

    int b = search(theKey);

    // 空或者和Key不相等都说明没有匹配的键值对
    if (table[b] == NULL || table[b]->first != theKey)
        return NULL;

    return table[b];  // 匹配到键值对
}

template<class K, class E>
void hashTable<K, E>::insert(const pair<const K, E>& thePair)
{// 插入thePair, 若存在相同的则更新值, 表满打印提示

    int b = search(thePair.first);

    if (table[b] == NULL)
    {
        // 没有匹配的键值对则插入
        table[b] = new pair<const K, E>(thePair);
        dSize++;
    }
    else
    {
        if (table[b]->first == thePair.first)
        {// 匹配的键值对相等则更新值
            table[b]->second = thePair.second;
        }
        else // 不相等说明表已经满
            cout << "散列表已满" << endl;
    }
}

template<class K, class E>
void hashTable<K, E>::output() const
{
    for (int i = 0; i < divisor; i++)
    {
        if (table[i] == NULL)
            cout << "NULL" << endl;
        else
            cout << table[i]->first << " "
            << table[i]->second << endl;
    }
}

3.5 散列的链式实现

每个桶可以无限增加记录，所以链表实现解决了溢出问题

#pragma once
#include <iostream>
#include "hash.h"           // 将K转换为非负整数
#include "dictionary.h"     // 字典
#include "sortedChain.hpp"  // 顺序链表字典

using namespace std;

template<class K, class E>
class hashChains : public dictionary<K, E>
{
protected:
    sortedChain<K, E>* table;  // 散列表
    myhash<K> hash;            // 将K转换为非负整数
    int dSize;                 // 元素个数
    int divisor;               // 节点个数

public:
    hashChains(int theDivisor = 11)
    {
        divisor = theDivisor;
        dSize = 0;

        // 初始化一个顺序链表字典的数组作为散列表
        table = new sortedChain<K, E>[divisor];
    }

    ~hashChains() { delete[] table; }

    bool empty() const { return dSize == 0; }
    int size() const { return dSize; }

    pair<const K, E>* find(const K& theKey) const
    {// 只需要确定初始桶的位置然后调用顺序链表字典的查找即可
        return table[hash(theKey) % divisor].find(theKey);
    }

    void insert(const pair<const K, E>& thePair)
    {
        int homeBucket = (int)hash(thePair.first) % divisor;    // 初始桶位置
        int homeSize = table[homeBucket].size();                // 当前该桶大小
        table[homeBucket].insert(thePair);                      // 插入thePair
        if (table[homeBucket].size() > homeSize)                // 如果桶大小发生改变
            dSize++;                                            // 字典元素个数++
    }

    void erase(const K& theKey)
    {
        table[hash(theKey) % divisor].erase(theKey);
    }

    void output() const
    {
        for (int i = 0; i < divisor; i++)
            if (table[i].size() == 0)
                cout << "NULL" << endl;
            else
                table[i].output();
    }
};