diff --git a/.gitignore b/.gitignore
index 2ed53e5365d17..88b42cfe566cf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
+.cproject
+.project
+Debug/*
 *-t
 *.a
 *.ctest
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e80f8d628fc11..22d8569cb444b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -363,6 +363,7 @@ IF(WITH_UNIT_TESTS)
   ADD_SUBDIRECTORY(unittest/examples)
   ADD_SUBDIRECTORY(unittest/mysys)
   ADD_SUBDIRECTORY(unittest/my_decimal)
+  ADD_SUBDIRECTORY(unittest/eperi)
   IF(NOT WITHOUT_SERVER)
     ADD_SUBDIRECTORY(unittest/sql)
   ENDIF()
diff --git a/dbug/dbug.c b/dbug/dbug.c
index dffd7a44cd8f0..b26500ee6a515 100644
--- a/dbug/dbug.c
+++ b/dbug/dbug.c
@@ -85,6 +85,7 @@
 #undef SAFE_MUTEX
 #include
 #include
+#include
 
 #ifndef DBUG_OFF
 
@@ -2184,6 +2185,51 @@ const char* _db_get_func_(void)
   return cs->func;
 }
 
+
+/*
+  Write a hex + ASCII dump of a buffer to stderr, 16 bytes per row.
+
+  SYNOPSIS
+    dump_buffer()
+    n      number of bytes to dump
+    buf    first byte to dump
+
+  Each row is "<offset>: <hex bytes> <printable characters>", where the
+  offset is the index of the first byte of the row in hexadecimal and
+  non-printable bytes are shown as '.'. A short final row is padded so
+  that the character column stays aligned. Nothing but the trailing
+  blank line is written for n == 0.
+
+  NOTE(review): this definition sits in front of the "#else" that follows
+  in this hunk, while my_dbug.h declares the function unconditionally;
+  confirm that every caller is compiled under the same condition.
+*/
+void dump_buffer(unsigned n, const unsigned char* buf)
+{
+  enum { BYTES_PER_ROW= 16 };
+  FILE *stream= stderr;
+  unsigned offset;
+
+  fflush(stream);
+  for (offset= 0; offset < n; offset+= BYTES_PER_ROW)
+  {
+    unsigned row_len= n - offset;
+    unsigned i;
+
+    if (row_len > BYTES_PER_ROW)
+      row_len= BYTES_PER_ROW;
+
+    fprintf(stream, "%06X: ", offset);
+    for (i= 0; i < row_len; i++)
+      fprintf(stream, "%02X ", buf[offset + i]);
+    /* Every missing byte occupies the three columns of "%02X ". */
+    for (i= row_len; i < BYTES_PER_ROW; i++)
+      fprintf(stream, "   ");
+    fprintf(stream, " ");
+    for (i= 0; i < row_len; i++)
+      fprintf(stream, "%c", isprint(buf[offset + i]) ? buf[offset + i] : '.');
+    fprintf(stream, "\n");
+  }
+  fprintf(stream, "\n");
+  fflush(stream);
+}
+
+
 #else
 
 /*
diff --git a/include/keyfile.h b/include/keyfile.h
new file mode 100644
index 0000000000000..d55c79086d47c
--- /dev/null
+++ b/include/keyfile.h
@@ -0,0 +1,38 @@
+/* Copyright (C) 2014 eperi GmbH. All Rights Reserved.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
+
+/******************************************************************/
+#ifndef KEYFILE_H
+#define KEYFILE_H
+#include
+
+/*
+  One entry of the key file: a numeric key id plus the IV and the key,
+  both kept as the hex text that was matched in the key file line.
+*/
+struct keyentry {
+  int id;
+  char *iv;
+  char *key;
+};
+
+/*
+  Read the whole key file from 'fp', decrypt it with 'secret' when it starts
+  with the "Salted__" marker, and store one allocated keyentry per valid
+  line in allKeys[entry->id]. 'k_len' is handed on to parseLine() as the
+  exclusive upper bound for key ids. Returns 0 on success and a non-zero
+  code otherwise.
+*/
+int
+parseFile(FILE * fp, struct keyentry **allKeys, const int k_len, const char *secret);
+
+/*
+  Parse one "<id>;<iv>;<key>" line into 'entry'. Returns 0 on success and
+  a non-zero code when the line does not match or the id is not accepted.
+*/
+int
+parseLine(const char *line, struct keyentry *entry, const int k_len);
+
+/* Returns 1 when the comment pattern matches 'line', 0 otherwise. */
+int
+isComment(char *line);
+
+/* NOTE(review): no definition of trim() is visible in this change - TODO confirm it exists. */
+char*
+trim(char *in);
+#endif
diff --git a/include/my_aes.h b/include/my_aes.h
index 58a7891902338..c628bfa21ab82 100644
--- a/include/my_aes.h
+++ b/include/my_aes.h
@@ -1,6 +1,11 @@
 #ifndef MY_AES_INCLUDED
 #define MY_AES_INCLUDED
 
+/*
+  Result codes of the CBC functions declared below.
+  NOTE(review): my_aes.cc defines AES_BAD_DATA again with the same value;
+  the two definitions have to stay token-identical.
+*/
+#define AES_OK 0
+#define AES_BAD_DATA -1
+#define AES_BAD_KEYSIZE -5
+#define AES_KEY_CREATION_FAILED -10
+
 /* Copyright (c) 2002, 2006 MySQL AB, 2009 Sun Microsystems, Inc.
    Use is subject to license terms.
 
@@ -27,6 +32,47 @@ C_MODE_START
 
 #define AES_KEY_LENGTH 128 /* Must be 128 192 or 256 */
 
+
+/*
+  my_aes_encrypt_cbc - Crypt buffer with AES encryption algorithm using cbc mode.
+  source        - Pointer to data for encryption
+  source_length - size of encryption data
+  dest          - buffer to place encrypted data (must be large enough)
+  dest_length   - receives the size of the encrypted data
+  key           - Key to be used for encryption
+  key_length    - Length of the key in bytes: 16, 24 or 32
+  iv            - Initialization vector
+  iv_length     - Length of the initialization vector in bytes
+
+  returns - AES_OK on success, or a negative AES_* code in case of error.
+*/
+int my_aes_encrypt_cbc(const char* source, unsigned long int source_length,
+ char* dest, unsigned long int *dest_length,
+ const unsigned char* key, uint8 key_length,
+ const unsigned char* iv, uint8 iv_length);
+
+
+/**
+ * Calculate key and iv from a given salt and secret as it is handled in openssl encrypted files via console
+ *
+ * SYNOPSIS
+ * my_bytes_to_key()
+ * @param salt [in] the given salt as extracted from the encrypted file
+ * @param secret [in] the given secret as String, provided by the user
+ * @param key [out] 32 Bytes of key are written to this pointer
+ * @param iv [out] 16 Bytes of iv are written to this pointer
+ */
+void my_bytes_to_key(const unsigned char *salt,
+ const char *secret, unsigned char *key,
+ unsigned char *iv);
+/**
+ Decode Hexencoded String to uint8[].
+ my_aes_hexToUint()
+ @param in [in] Pointer to hexadecimal encoded String, two digits per byte
+ @param out [out] Pointer to output uint8 array. Memory needs to be allocated by caller
+ @param dest_length [in] Size of destination array in bytes.
+ */
+void my_aes_hexToUint(const char* in,
+ unsigned char *out,
+ int dest_length);
+
 /*
   my_aes_encrypt - Crypt buffer with AES encryption algorithm.
   source - Pointer to data for encryption
@@ -41,6 +87,23 @@ C_MODE_START
 int my_aes_encrypt(const char *source, int source_length, char *dest,
   const char *key, int key_length);
 
+/*
+  my_aes_decrypt_cbc - DeCrypt buffer with AES encryption algorithm using
+  cbc Mode.
+  source        - Pointer to data for decryption
+  source_length - size of encrypted data
+  dest          - buffer to place decrypted data (must be large enough)
+  dest_length   - receives the size of the decrypted data
+  key           - Key to be used for decryption
+  key_length    - Length of the key in bytes: 16, 24 or 32
+  iv            - Initialization vector
+  iv_length     - Length of the initialization vector in bytes
+
+  returns - AES_OK on success, or a negative AES_* code in case of error.
+*/
+
+int my_aes_decrypt_cbc(const char* source, unsigned long int source_length,
+ char* dest, unsigned long int *dest_length,
+ const unsigned char* key, uint8 key_length,
+ const unsigned char* iv, uint8 iv_length);
+
 /*
   my_aes_decrypt - DeCrypt buffer with AES encryption algorithm.
   source - Pointer to data for decryption
diff --git a/include/my_dbug.h b/include/my_dbug.h
index bcf2015466dec..3837e35f1417a 100644
--- a/include/my_dbug.h
+++ b/include/my_dbug.h
@@ -52,6 +52,9 @@ extern void _db_return_(uint _line_, struct _db_stack_frame_ *_stack_frame_);
 extern void _db_pargs_(uint _line_,const char *keyword);
 extern void _db_doprnt_(const char *format,...)
   ATTRIBUTE_FORMAT(printf, 1, 2);
+
+/* Hex/ASCII dump of 'n' bytes starting at 'buf', written to stderr. */
+extern void dump_buffer(unsigned n, const unsigned char* buf);
+
 extern void _db_dump_(uint _line_,const char *keyword,
   const unsigned char *memory, size_t length);
 extern void _db_end_(void);
diff --git a/mysql-test/r/enc.result b/mysql-test/r/enc.result
new file mode 100644
index 0000000000000..46b53558e23ae
--- /dev/null
+++ b/mysql-test/r/enc.result
@@ -0,0 +1,20 @@
+DROP TABLE IF EXISTS t1;
+DROP DATABASE IF EXISTS test;
+CREATE DATABASE test;
+USE test;
+set @save_storage_engine= @@storage_engine;
+set storage_engine=InnoDB;
+CREATE TABLE t1 (id int)
+PAGE_ENCRYPTION='abc';
+ERROR HY000: Incorrect value 'abc' for option 'PAGE_ENCRYPTION'
+CREATE TABLE t1 (id int)
+PAGE_ENCRYPTION=1
+PAGE_ENCRYPTION_KEY='0xFFC';
+ERROR HY000: Incorrect value '0xFFC' for option 'PAGE_ENCRYPTION_KEY'
+CREATE TABLE t1 (id int(11))
+PAGE_ENCRYPTION=1
+PAGE_ENCRYPTION_KEY=42;
+INSERT INTO t1(id) values(1);
+SELECT * FROM t1;
+id
+1
diff --git a/mysql-test/t/enc.test b/mysql-test/t/enc.test
new file mode 100644
index 0000000000000..6e93d4765d816
--- /dev/null
+++ b/mysql-test/t/enc.test
@@ -0,0 +1,28 @@
+-- source include/have_xtradb.inc
+
+--disable_warnings
+DROP TABLE IF EXISTS t1;
+DROP DATABASE IF EXISTS test;
+--enable_warnings
+
+CREATE DATABASE test;
+USE test;
+set 
@save_storage_engine= @@storage_engine;
+set storage_engine=InnoDB;
+
+--error ER_BAD_OPTION_VALUE
+CREATE TABLE t1 (id int)
+ PAGE_ENCRYPTION='abc';
+
+--error ER_BAD_OPTION_VALUE
+CREATE TABLE t1 (id int)
+ PAGE_ENCRYPTION=1
+ PAGE_ENCRYPTION_KEY='0xFFC';
+
+CREATE TABLE t1 (id int(11))
+ PAGE_ENCRYPTION=1
+ PAGE_ENCRYPTION_KEY=42;
+
+INSERT INTO t1(id) values(1);
+SELECT * FROM t1;
+
diff --git a/mysys/CMakeLists.txt b/mysys/CMakeLists.txt
index f0d25dae6b93d..83060bee88da2 100644
--- a/mysys/CMakeLists.txt
+++ b/mysys/CMakeLists.txt
@@ -39,7 +39,7 @@ SET(MYSYS_SOURCES array.c charset-def.c charset.c checksum.c my_default.c
   my_atomic.c my_getncpus.c my_safehash.c my_chmod.c my_rnd.c
   my_uuid.c wqueue.c waiting_threads.c ma_dyncol.c
   my_rdtsc.c my_context.c psi_noop.c
- file_logger.c)
+ file_logger.c )
 
 IF (WIN32)
   SET (MYSYS_SOURCES ${MYSYS_SOURCES} my_winthread.c my_wincond.c my_winerr.c my_winfile.c my_windac.c my_conio.c)
@@ -68,7 +68,7 @@ IF(HAVE_MLOCK)
 ENDIF()
 
 ADD_CONVENIENCE_LIBRARY(mysys ${MYSYS_SOURCES})
-TARGET_LINK_LIBRARIES(mysys dbug strings ${ZLIB_LIBRARY}
+TARGET_LINK_LIBRARIES(mysys dbug strings mysys_ssl ${ZLIB_LIBRARY}
   ${LIBNSL} ${LIBM} ${LIBRT} ${LIBSOCKET} ${LIBEXECINFO})
 DTRACE_INSTRUMENT(mysys)
 
diff --git a/mysys_ssl/my_aes.cc b/mysys_ssl/my_aes.cc
index 9327bc32a3b60..012966d0626cf 100644
--- a/mysys_ssl/my_aes.cc
+++ b/mysys_ssl/my_aes.cc
@@ -24,6 +24,8 @@
 #elif defined(HAVE_OPENSSL)
 #include
 #include
+#include
+#include
 
 // Wrap C struct, to ensure resources are released.
 struct MyCipherCtx
@@ -42,6 +44,7 @@ enum encrypt_dir { MY_AES_ENCRYPT, MY_AES_DECRYPT };
 /* If bad data discovered during decoding */
 #define AES_BAD_DATA -1
 
+
 /**
   This is internal function just keeps joint code of Key generation
 
@@ -101,7 +104,43 @@ static int my_aes_create_key(const char *key, int key_length, uint8 *rkey)
   return 0;
 }
 
+/**
+  Value of a single hexadecimal digit.
+
+  @param c  character to convert
+  @return 0..15 for a hex digit, -1 for any other character (including NUL)
+*/
+static int my_aes_hex_digit(char c)
+{
+  if (c >= '0' && c <= '9')
+    return c - '0';
+  if (c >= 'a' && c <= 'f')
+    return c - 'a' + 10;
+  if (c >= 'A' && c <= 'F')
+    return c - 'A' + 10;
+  return -1;
+}
+
+/**
+  Decode a hex-encoded string into a byte array.
+
+  Two hex digits are consumed per output byte. Decoding stops at the first
+  position that does not hold a complete pair of hex digits, which includes
+  the terminating NUL, so a short input is never read past its end. Every
+  output byte that could not be decoded is set to 0, so 'out' is always
+  fully initialised.
+
+  @param in           [in]  NUL-terminated hexadecimal string (may be NULL)
+  @param out          [out] destination array, allocated by the caller
+  @param dest_length  [in]  number of bytes to write to 'out'
+ */
+void
+my_aes_hexToUint(const char* in, unsigned char *out, int dest_length)
+{
+  int count= 0;
+
+  if (in != NULL)
+  {
+    for (; count < dest_length; count++)
+    {
+      int high= my_aes_hex_digit(in[2 * count]);
+      /* Look at the second digit only when the first one is valid. */
+      int low= (high < 0) ? -1 : my_aes_hex_digit(in[2 * count + 1]);
+
+      if (low < 0)
+        break;
+      out[count]= (unsigned char) ((high << 4) | low);
+    }
+  }
+  for (; count < dest_length; count++)
+    out[count]= 0;
+}
+
+/**
+ * Calculate key and iv from a given salt and secret as it is handled in openssl encrypted files via console
+ *
+ * The derivation is EVP_BytesToKey() for AES-256-CBC with SHA1 and one
+ * iteration. If the secret is NULL or the derivation fails, key and iv are
+ * set to all zero bytes instead of being left uninitialised.
+ *
+ * NOTE(review): the body uses OpenSSL calls without the HAVE_OPENSSL guard
+ * that the CBC functions in this file use - confirm non-OpenSSL builds.
+ *
+ * SYNOPSIS
+ * my_bytes_to_key()
+ * @param salt [in] the given salt as extracted from the encrypted file
+ * @param secret [in] the given secret as String, provided by the user
+ * @param key [out] 32 Bytes of key are written to this pointer
+ * @param iv [out] 16 Bytes of iv are written to this pointer
+ */
+void
+my_bytes_to_key(const unsigned char *salt, const char *secret, unsigned char *key, unsigned char *iv)
+{
+  enum { KEY_BYTES= 32, IV_BYTES= 16 };
+  int derived= 0;
+
+  if (secret != NULL)
+    derived= EVP_BytesToKey(EVP_aes_256_cbc(), EVP_sha1(), salt,
+                            (const unsigned char*) secret,
+                            (int) strlen(secret), 1, key, iv);
+  /* EVP_BytesToKey() returns the key size, or 0 on error. */
+  if (derived == 0)
+  {
+    memset(key, 0, KEY_BYTES);
+    memset(iv, 0, IV_BYTES);
+  }
+}
 
 /**
   Crypt buffer with AES encryption algorithm.
@@ -117,8 +156,94 @@ static int my_aes_create_key(const char *key, int key_length, uint8 *rkey)
     >= 0 Size of encrypted data
     < 0 Error
 */
 
+#if defined(HAVE_OPENSSL)
+/**
+  Select the AES-CBC cipher that matches a key length given in bytes.
+
+  @param key_length  key length in bytes
+  @return the cipher for 16, 24 or 32 byte keys, NULL for any other length
+*/
+static const EVP_CIPHER *aes_cbc_cipher(uint8 key_length)
+{
+  switch (key_length) {
+  case 16: return EVP_aes_128_cbc();
+  case 24: return EVP_aes_192_cbc();
+  case 32: return EVP_aes_256_cbc();
+  default: return NULL;
+  }
+}
+
+/**
+  Validate the arguments shared by the CBC encrypt and decrypt entry points.
+
+  @param cipher         result of aes_cbc_cipher(), NULL for a bad key length
+  @param source_length  number of input bytes
+  @param iv             initialization vector
+  @param iv_length      length of 'iv' in bytes
+
+  @return AES_OK, AES_BAD_KEYSIZE for an unsupported key length, or
+          AES_BAD_DATA for a missing or wrongly sized IV or for an input
+          that is too long for the int length argument of the EVP calls.
+*/
+static int aes_cbc_check_args(const EVP_CIPHER *cipher,
+                              unsigned long int source_length,
+                              const unsigned char *iv, uint8 iv_length)
+{
+  if (cipher == NULL)
+    return AES_BAD_KEYSIZE;
+  /* Checked before the IV is handed to OpenSSL, and without aborting. */
+  if (iv == NULL || EVP_CIPHER_iv_length(cipher) != iv_length)
+    return AES_BAD_DATA;
+  if (source_length > 0x7fffffffUL)
+    return AES_BAD_DATA;
+  return AES_OK;
+}
+#endif
+
+/**
+  Encrypt a buffer with AES in CBC mode.
+
+  @param source         [in]  data to encrypt
+  @param source_length  [in]  number of bytes in 'source'
+  @param dest           [out] buffer for the encrypted data; padding can add
+                              up to one 16 byte block to the input size
+  @param dest_length    [out] number of bytes written to 'dest' on success
+  @param key            [in]  encryption key
+  @param key_length     [in]  key length in bytes: 16, 24 or 32
+  @param iv             [in]  initialization vector
+  @param iv_length      [in]  length of 'iv' in bytes
+
+  @return AES_OK on success, AES_BAD_KEYSIZE for an unsupported key length,
+          AES_BAD_DATA for any other failure. Builds without OpenSSL have
+          no implementation and always return AES_BAD_DATA.
+*/
+int my_aes_encrypt_cbc(const char* source, unsigned long int source_length,
+                       char* dest, unsigned long int *dest_length,
+                       const unsigned char* key, uint8 key_length,
+                       const unsigned char* iv, uint8 iv_length)
+{
+#if defined(HAVE_OPENSSL)
+  MyCipherCtx ctx;
+  int u_len= 0, f_len= 0;
+  const EVP_CIPHER *cipher= aes_cbc_cipher(key_length);
+  int rc= aes_cbc_check_args(cipher, source_length, iv, iv_length);
+
+  if (rc != AES_OK)
+    return rc;
+  /* NULL engine: the default software implementation is used. */
+  if (! EVP_EncryptInit_ex(&ctx.ctx, cipher, NULL, key, iv))
+    return AES_BAD_DATA; /* Error */
+  if (! EVP_EncryptUpdate(&ctx.ctx, (unsigned char *) dest, &u_len,
+                          (unsigned const char *) source, (int) source_length))
+    return AES_BAD_DATA; /* Error */
+  if (! EVP_EncryptFinal_ex(&ctx.ctx, (unsigned char *) dest + u_len, &f_len))
+    return AES_BAD_DATA; /* Error */
+  *dest_length = (unsigned long int) (u_len + f_len);
+  return AES_OK;
+#else
+  /* Nothing was encrypted: never report success with untouched output. */
+  return AES_BAD_DATA;
+#endif
+}
+
+/**
+  Decrypt a buffer with AES in CBC mode.
+
+  @param source         [in]  data to decrypt
+  @param source_length  [in]  number of bytes in 'source'
+  @param dest           [out] buffer for the decrypted data
+  @param dest_length    [out] number of bytes written to 'dest'; when only
+                              the final block fails this is the number of
+                              bytes produced before it
+  @param key            [in]  decryption key
+  @param key_length     [in]  key length in bytes: 16, 24 or 32
+  @param iv             [in]  initialization vector
+  @param iv_length      [in]  length of 'iv' in bytes
+
+  @return AES_OK on success, AES_BAD_KEYSIZE for an unsupported key length,
+          AES_BAD_DATA for any other failure. Builds without OpenSSL have
+          no implementation and always return AES_BAD_DATA.
+*/
+int my_aes_decrypt_cbc(const char* source, unsigned long int source_length,
+                       char* dest, unsigned long int *dest_length,
+                       const unsigned char* key, uint8 key_length,
+                       const unsigned char* iv, uint8 iv_length)
+{
+#if defined(HAVE_OPENSSL)
+  MyCipherCtx ctx;
+  int u_len= 0, f_len= 0;
+  const EVP_CIPHER *cipher= aes_cbc_cipher(key_length);
+  int rc= aes_cbc_check_args(cipher, source_length, iv, iv_length);
+
+  if (rc != AES_OK)
+    return rc;
+  /* NULL engine: the default software implementation is used. */
+  if (! EVP_DecryptInit_ex(&ctx.ctx, cipher, NULL, key, iv))
+    return AES_BAD_DATA; /* Error */
+  if (! EVP_DecryptUpdate(&ctx.ctx, (unsigned char *) dest, &u_len,
+                          (unsigned char *)source, (int) source_length))
+    return AES_BAD_DATA; /* Error */
+  if (! EVP_DecryptFinal_ex(&ctx.ctx, (unsigned char *) dest + u_len, &f_len)) {
+    *dest_length = (unsigned long int) u_len;
+    return AES_BAD_DATA;
+  }
+  *dest_length = (unsigned long int) (u_len + f_len);
+  return AES_OK;
+#else
+  /* Nothing was decrypted: never report success with untouched output. */
+  return AES_BAD_DATA;
+#endif
+}
+
-int my_aes_encrypt(const char* source, int source_length, char* dest,
+int
+my_aes_encrypt(const char* source, int source_length, char* dest,
   const char* key, int key_length)
 {
 #if defined(HAVE_YASSL)
diff --git a/sql/mysqld.cc b/sql/mysqld.cc
index 899140f7d7147..d24e9570ba349 100644
--- a/sql/mysqld.cc
+++ b/sql/mysqld.cc
@@ -14,6 +14,8 @@
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA */
 
+#include "../storage/xtradb/include/KeySingleton.h"
+
 #include "sql_plugin.h"
 #include "sql_priv.h"
 #include "unireg.h"
@@ -696,7 +698,7 @@ MY_LOCALE *my_default_lc_time_names;
 
 SHOW_COMP_OPTION have_ssl, have_symlink, have_dlopen, have_query_cache;
 SHOW_COMP_OPTION have_geometry, have_rtree_keys;
-SHOW_COMP_OPTION have_crypt, have_compress;
+SHOW_COMP_OPTION have_crypt, have_datacrypt, have_compress;
 SHOW_COMP_OPTION have_profiling;
 SHOW_COMP_OPTION have_openssl;
 
@@ -5717,6 +5719,10 @@ int mysqld_main(int argc, char **argv)
   mysql_cond_signal(&COND_server_started);
   mysql_mutex_unlock(&LOCK_server_started);
 
+  /* Startup probe of the key store; IV and key bytes must never be printed. */
+  KeySingleton& ksp2 = KeySingleton::getInstance();
+  struct keyentry *entry = ksp2.getKeys(2);
+  if(entry) printf("encryption key id:%3d is available\n", entry->id);
+
 #if defined(_WIN32) || defined(HAVE_SMEM)
   handle_connections_methods();
 #else
@@ -8481,6 +8487,11 @@ static int mysql_init_variables(void)
 #else
   have_crypt=SHOW_OPTION_NO;
 #endif
+#ifdef HAVE_DATACRYPT
+  have_datacrypt=SHOW_OPTION_YES;
+#else
+  have_datacrypt=SHOW_OPTION_NO;
+#endif
 #ifdef HAVE_COMPRESS
   have_compress= SHOW_OPTION_YES;
 #else
diff --git a/storage/xtradb/CMakeLists.txt b/storage/xtradb/CMakeLists.txt
index e34add6188603..a5d3baa10fd7c 100644
--- 
a/storage/xtradb/CMakeLists.txt +++ b/storage/xtradb/CMakeLists.txt @@ -30,6 +30,7 @@ MYSQL_CHECK_BZIP2() # OS tests IF(UNIX) + IF(CMAKE_SYSTEM_NAME STREQUAL "Linux") CHECK_INCLUDE_FILES (libaio.h HAVE_LIBAIO_H) IF (XTRADB_PREFER_STATIC_LIBAIO) @@ -312,10 +313,14 @@ SET(INNOBASE_SOURCES dict/dict0stats.cc dict/dict0stats_bg.cc dyn/dyn0dyn.cc + enc/EncKeys.cc + enc/KeySingleton.cc + enc/keyfile.c eval/eval0eval.cc eval/eval0proc.cc fil/fil0fil.cc fil/fil0pagecompress.cc + fil/fil0pageencryption.cc fsp/fsp0fsp.cc fut/fut0fut.cc fut/fut0lst.cc diff --git a/storage/xtradb/buf/buf0buf.cc b/storage/xtradb/buf/buf0buf.cc index 359b15f4a6b8c..57146a694847a 100644 --- a/storage/xtradb/buf/buf0buf.cc +++ b/storage/xtradb/buf/buf0buf.cc @@ -57,6 +57,9 @@ Created 11/5/1995 Heikki Tuuri #include "trx0trx.h" #include "srv0start.h" +#include "fil0pageencryption.h" + + /* prototypes for new functions added to ha_innodb.cc */ trx_t* innobase_get_trx(); @@ -528,12 +531,13 @@ buf_page_is_corrupted( ulint zip_size) /*!< in: size of compressed page; 0 for uncompressed pages */ { + ulint page_encrypted = fil_page_is_encrypted(read_buf); ulint checksum_field1; ulint checksum_field2; ibool crc32_inited = FALSE; ib_uint32_t crc32 = ULINT32_UNDEFINED; - if (!zip_size + if (!page_encrypted && !zip_size && memcmp(read_buf + FIL_PAGE_LSN + 4, read_buf + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) { @@ -586,6 +590,9 @@ buf_page_is_corrupted( if (zip_size) { return(!page_zip_verify_checksum(read_buf, zip_size)); } + if (page_encrypted) { + return (FALSE); + } checksum_field1 = mach_read_from_4( read_buf + FIL_PAGE_SPACE_OR_CHKSUM); diff --git a/storage/xtradb/enc/EncKeys.cc b/storage/xtradb/enc/EncKeys.cc new file mode 100644 index 0000000000000..8c3464295f5cb --- /dev/null +++ b/storage/xtradb/enc/EncKeys.cc @@ -0,0 +1,309 @@ +/* Copyright (C) 2014 eperi GmbH. All Rights Reserved. 
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
+
+/******************************************************************//**
+ @file EncKeys.cc
+ A class to keep keys for encryption/decryption.
+
+ Created 09/15/2014 Florin Fugaciu
+ ***********************************************************************/
+
+#include "EncKeys.h"
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+/*
+ The key file may contain errors. The following errors are handled:
+ 1. Duplicate key IDs:
+ 1.1. Identical keys --> print a message that points to the line numbers
+ 1.2. Different keys --> print a message that points to the line numbers
+ and choose the key with the smallest line number
+ 2. The key file is too large --> print a message and abort
+ 3. Error in a key --> print a message and mark the key as not present. Messages:
+ 3.1. Key too large
+ 3.2. The key does not match the required format
+ 4. Key server not implemented yet --> print a message noting that
+ encrypted tables and columns cannot be read.
+
+ Note: for now the messages do not refer to line numbers.
+
+ NOTE(review): parseFile() in this file stores a parsed key without looking
+ for an existing entry with the same id, so the handling described under 1.
+ is not visible there - confirm.
+*/
+
+/* Marker at the start of a key file that was encrypted with a passphrase. */
+const char* EncKeys::strMAGIC = "Salted__";
+const int EncKeys::magicSize = strlen(strMAGIC); // 8 byte
+/* Separator used to split the key file into lines. */
+const char* EncKeys::newLine = "\n";
+
+/* printf-style message templates used by this class. */
+const char* EncKeys::errorNoKeyId = "KeyID = %u not found or with error. Check the key and the log file.\n";
+const char* EncKeys::errorInMatches = "Wrong match of the keyID in line %u, see the template.\n";
+const char* EncKeys::errorExceedKeyFileSize = "The size of the key file %s exceeds "
+ "the maximum allowed of %u bytes.\n";
+const char* EncKeys::errorExceedKeySize = "The key size exceeds the maximum allowed size of %u in line %u.\n";
+const char* EncKeys::errorEqualDoubleKey = "More than one identical key with keyID = %u found"
+ " in lines %u and %u.\nDelete one of them in the key file.\n";
+const char* EncKeys::errorUnequalDoubleKey = "More than one not identical key with keyID = %u found"
+ " in lines %u and %u.\nChoose the right one and delete the other in the key file.\n"
+ "I'll take the key from line %u\n";
+const char* EncKeys::errorNoInitializedKey = "The key could not be initialized.\n";
+const char* EncKeys::errorNotImplemented = "Initializing keys through key server is not"
+ " yet implemented.\nYou can not read encrypted tables or columns\n\n";
+const char* EncKeys::errorOpenFile = "Could not open %s for reading. You can not read encrypted tables or columns.\n\n";
+const char* EncKeys::errorReadingFile = "Could not read from %s. You can not read encrypted tables or columns\n\n";
+const char* EncKeys::errorFileSize = "Could not get the file size from %s. 
You can not read encrypted tables or columns\n\n";
+const char* EncKeys::errorFalseFileKey = "Wrong encryption / decryption key for keyfile '%s'.\n";
+
+
+/** Start with an empty key table: every slot has id 0 and no iv/key. */
+EncKeys::EncKeys() {
+	countKeys = keyLineInKeyFile = 0;
+	for (int ii = 0; ii < MAX_KEYS; ii++) {
+		keys[ii].id = 0;
+		keys[ii].iv = keys[ii].key = NULL;
+	}
+	oneKey = NULL;
+}
+
+/**
+ * Release the iv/key strings of every slot and the scratch entry.
+ * 'oneKey' is only ever set to an entry allocated by parseLine(), because
+ * getKeys() no longer points it into the 'keys' array, so deleting it here
+ * is valid.
+ */
+EncKeys::~EncKeys() {
+	for (int ii = MAX_KEYS - 1; ii >= 0 ; ii--) {
+		delete[] keys[ii].iv;	keys[ii].iv = NULL;
+		delete[] keys[ii].key;	keys[ii].key = NULL;
+	}
+	delete oneKey;	oneKey = NULL;
+}
+
+/**
+ * Load the keys from the source selected by 'initType'.
+ * For KEYINITTYPE_FILE 'url' is the directory and 'name' the file name; the
+ * result is false only when the key file could not be decrypted with
+ * 'filekey'.
+ * NOTE(review): every other error of the file path, a file that cannot be
+ * opened included, still yields true - confirm that this is intended.
+ * For KEYINITTYPE_SERVER a "not implemented" message is printed.
+ */
+bool EncKeys::initKeys(const char *name, const char *url, const int initType, const char *filekey) {
+	if (KEYINITTYPE_FILE == initType) { // url == path && name == filename
+		return ERROR_FALSE_FILE_KEY != initKeysThroughFile(name, url, filekey);
+	}
+	if (KEYINITTYPE_SERVER == initType) {
+		/* Never pass a message as the format string itself. */
+		printf("%s", errorNotImplemented);
+	}
+	/* Compares two distinct status constants; presumably always false - verify in EncKeys.h. */
+	return NO_ERROR_KEY_FILE_PARSE_OK == ERROR_KEYINITTYPE_SERVER_NOT_IMPLEMENTED;
+}
+
+/**
+ * Build "<path>/<name>" and parse that key file, asking for key ids up to 254.
+ * A '/' is inserted only when 'path' is not empty and does not already end
+ * with one. Returns the result of parseFile(), or ERROR_OPEN_FILE when
+ * 'name' or 'path' is NULL.
+ */
+int EncKeys::initKeysThroughFile(const char *name, const char *path, const char *filekey) {
+	if (NULL == name || NULL == path) return ERROR_OPEN_FILE;
+
+	const size_t len1 = strlen(path);
+	const size_t len2 = strlen(name);
+	const bool addSlash = (0 < len1 && '/' != path[len1 - 1]);
+	/* path + optional separator + name + terminating NUL */
+	char *filename = new char[len1 + len2 + (addSlash ? 2 : 1)];
+
+	sprintf(filename, "%s%s%s", path, addSlash ? "/" : "", name);
+	int ret = parseFile((const char *)filename, 254, filekey);
+	delete[] filename;
+	return ret;
+}
+
+/**
+ * Returns a struct keyentry with the asked 'id' or NULL.
+ * The returned pointer refers to a slot of the internal key table and must
+ * not be freed by the caller. A message is printed when no key is stored
+ * for 'id'.
+ */
+keyentry *EncKeys::getKeys(int id) {
+	if (KEY_MIN <= id && KEY_MAX >= id) {
+		keyentry *entry = &keys[id - 1];
+		if (entry->iv) return entry;
+	}
+	printf(errorNoKeyId, id);
+	return NULL;
+}
+
+/**
+ * Get the keys from the key file 'filename' and decrypt it with the key 'secret'.
+ * Store the keys with id smaller then 'maxKeyId' in an array of structs keyentry.
+ * Returns NO_ERROR_PARSE_OK or an appropriate error code.
+ */
+int EncKeys::parseFile(const char* filename, const uint maxKeyId, const char *secret) {
+	int errorCode = 0;
+	char *buffer = decryptFile(filename, secret, &errorCode);
+
+	if (NO_ERROR_PARSE_OK != errorCode) return errorCode;
+	errorCode = NO_ERROR_KEY_FILE_PARSE_OK;
+
+	for (char *line = strtok(buffer, newLine); NULL != line; line = strtok(NULL, newLine)) {
+		keyLineInKeyFile++;
+		switch (parseLine(line, maxKeyId)) {
+		case NO_ERROR_PARSE_OK: {
+			/* parseLine() left a freshly allocated entry in 'oneKey'. */
+			keyentry *slot = &keys[oneKey->id - 1];
+			if (NULL == slot->iv) countKeys++;
+			/* A later line with the same id replaces the earlier one. */
+			delete[] slot->iv;
+			delete[] slot->key;
+			*slot = *oneKey;	/* the slot now owns the iv/key strings */
+			delete oneKey;	oneKey = NULL;
+			/* Only the id is reported; IV and key must not reach the output. */
+			printf("Line: %u --> Key: id:%3u\n", keyLineInKeyFile, (uint) slot->id);
+			break;
+		}
+		case ERROR_ID_TOO_BIG:
+			printf(errorExceedKeySize, KEY_MAX, keyLineInKeyFile);
+			printf(" --> %s\n", line);
+			errorCode = ERROR_KEY_FILE_EXCEEDS_MAX_NUMBERS_OF_KEYS;
+			break;
+		case ERROR_NOINITIALIZEDKEY:
+			printf("%s", errorNoInitializedKey);
+			printf(" --> %s\n", line);
+			errorCode = ERROR_KEY_FILE_PARSE_NULL;
+			break;
+		case ERROR_WRONG_NUMBER_OF_MATCHES:
+			printf(errorInMatches, keyLineInKeyFile);
+			printf(" --> %s\n", line);
+			errorCode = ERROR_KEY_FILE_PARSE_NULL;
+			break;
+		case NO_ERROR_KEY_GREATER_THAN_ASKED:
+			printf("No asked key in line %u: %s\n", keyLineInKeyFile, line);
+			break;
+		case NO_ERROR_ISCOMMENT:
+			printf("Is comment in line %u: %s\n", keyLineInKeyFile, line);
+			break;
+		default:
+			break;
+		}
+	}
+
+	delete[] buffer;
+	return errorCode;
+}
+
+/** Copy 'length' characters starting at 'start' into a new NUL-terminated array. */
+static char *encKeysCopyRange(const char *start, int length) {
+	char *copy = new char[length + 1];
+	memcpy(copy, start, length);
+	copy[length] = '\0';
+	return copy;
+}
+
+/**
+ * Parse one line of the key file: "<id>;<32 hex digits iv>;<32, 48 or 64 hex digits key>".
+ * On NO_ERROR_PARSE_OK a newly allocated entry is left in 'oneKey'; the
+ * caller takes it over. Every other result leaves 'oneKey' untouched.
+ *
+ * Returns NO_ERROR_ISCOMMENT for comment lines, ERROR_WRONG_NUMBER_OF_MATCHES
+ * when the line does not match, ERROR_ID_TOO_BIG for ids with more than three
+ * digits or above KEY_MAX, ERROR_NOINITIALIZEDKEY for id 0 and
+ * NO_ERROR_KEY_GREATER_THAN_ASKED for ids above 'maxKeyId'.
+ *
+ * NOTE(review): the pattern is not anchored, so text before the id and
+ * extra hex digits after the key are accepted - confirm whether the format
+ * should be stricter.
+ */
+int EncKeys::parseLine(const char *line, const uint maxKeyId) {
+	if (isComment(line)) return NO_ERROR_ISCOMMENT;
+
+	/* Initialised on every call: the static pattern is compiled only once. */
+	const char *error_p = NULL;
+	int offset = 0;
+	static const pcre *pattern = pcre_compile(
+			"([0-9]+);([0-9a-fA-F]{32});([0-9a-fA-F]{64}|[0-9a-fA-F]{48}|[0-9a-fA-F]{32})",
+			0, &error_p, &offset, NULL);
+	if (NULL == pattern) {
+		if (NULL != error_p)
+			fprintf(stderr, "Error: %s\nOffset: %d\n", error_p, offset);
+		return ERROR_WRONG_NUMBER_OF_MATCHES;
+	}
+
+	int ovector[MAX_OFFSETS_IN_PCRE_PATTERNS];
+	int rc = pcre_exec(pattern, NULL, line, (int) strlen(line), 0, 0, ovector,
+			MAX_OFFSETS_IN_PCRE_PATTERNS);
+	if (4 != rc) return ERROR_WRONG_NUMBER_OF_MATCHES;
+
+	/* The offsets refer to 'line' itself, so the substrings are taken from it. */
+	const int idLength = ovector[3] - ovector[2];
+	if (3 < idLength) return ERROR_ID_TOO_BIG;
+
+	char idText[4];
+	memcpy(idText, line + ovector[2], idLength);
+	idText[idLength] = '\0';
+	const uint id = atoi(idText);
+	if (0 == id) return ERROR_NOINITIALIZEDKEY;
+	if (KEY_MAX < id) return ERROR_ID_TOO_BIG;
+	if (maxKeyId < id) return NO_ERROR_KEY_GREATER_THAN_ASKED;
+
+	keyentry *entry = new keyentry;
+	entry->id = id;
+	entry->iv = encKeysCopyRange(line + ovector[4], ovector[5] - ovector[4]);
+	entry->key = encKeysCopyRange(line + ovector[6], ovector[7] - ovector[6]);
+	oneKey = entry;
+	return NO_ERROR_PARSE_OK;
+}
+
+/**
+ * Decrypt the key file 'filename' if it is encrypted with the key 'secret'.
+ * Store the content of the decrypted file in 'buffer'. The buffer has to be freed
+ * in the calling function.
+ *
+ * The file is closed on every path. '*errorCode' is NO_ERROR_PARSE_OK on
+ * success; on failure it is ERROR_OPEN_FILE, ERROR_READING_FILE,
+ * ERROR_KEY_FILE_TOO_BIG or ERROR_FALSE_FILE_KEY and NULL is returned.
+ * The returned buffer is NUL-terminated and must be released with delete[].
+ */
+char* EncKeys::decryptFile(const char* filename, const char *secret, int *errorCode) {
+	/* Size of the salt that follows the magic marker. */
+	enum { SALT_SIZE = 8 };
+
+	*errorCode = NO_ERROR_PARSE_OK;
+	printf("Reading %s\n\n", filename);
+	/* Binary mode: an encrypted key file is not text. */
+	FILE *fp = fopen(filename, "rb");
+	if (NULL == fp) {
+		printf(errorOpenFile, filename);
+		*errorCode = ERROR_OPEN_FILE;
+		return NULL;
+	}
+
+	long file_size = -1L;
+	if (0 == fseek(fp, 0L, SEEK_END)) file_size = ftell(fp);	// get the file size
+	if (0 > file_size) {
+		printf(errorFileSize, filename);
+		*errorCode = ERROR_READING_FILE;
+	}
+	else if (MAX_KEY_FILE_SIZE < file_size) {
+		printf(errorExceedKeyFileSize, filename, MAX_KEY_FILE_SIZE);
+		*errorCode = ERROR_KEY_FILE_TOO_BIG;
+	}
+	else if (0 != fseek(fp, 0L, SEEK_SET)) {
+		printf(errorReadingFile, filename);
+		*errorCode = ERROR_READING_FILE;
+	}
+	if (NO_ERROR_PARSE_OK != *errorCode) {
+		fclose(fp);
+		return NULL;
+	}
+
+	//Read file into buffer
+	char *buffer = new char[file_size + 1];
+	const size_t bytesRead = fread(buffer, 1, (size_t) file_size, fp);
+	fclose(fp);
+	if ((size_t) file_size != bytesRead) {
+		printf(errorReadingFile, filename);
+		*errorCode = ERROR_READING_FILE;
+		delete[] buffer;
+		return NULL;
+	}
+	buffer[file_size] = '\0';
+
+	//Check for file encryption
+	if (file_size >= magicSize && 0 == memcmp(buffer, strMAGIC, magicSize)) { //If file is encrypted, decrypt it first.
+		const long headerSize = magicSize + SALT_SIZE;
+		unsigned long int d_size = 0;
+		int res = -1;	/* stays an error when the header is truncated */
+
+		if (file_size >= headerSize) {
+			unsigned char salt[SALT_SIZE];
+			unsigned char *key = new unsigned char[keySize32];
+			unsigned char *iv = new unsigned char[ivSize16];
+			char *decrypted = new char[file_size];
+
+			memcpy(salt, buffer + magicSize, SALT_SIZE);
+			my_bytes_to_key(salt, secret, key, iv);
+			res = my_aes_decrypt_cbc(buffer + headerSize, file_size - headerSize,
+					decrypted, &d_size, key, keySize32, iv, ivSize16);
+			if (0 == res) {
+				memcpy(buffer, decrypted, d_size);
+				buffer[d_size] = '\0';
+			}
+			delete[] decrypted;
+			delete[] key;
+			delete[] iv;
+		}
+		if (0 != res) {
+			*errorCode = ERROR_FALSE_FILE_KEY;
+			delete[] buffer;	buffer = NULL;
+			/* Report the file name; the secret must never be printed. */
+			printf(errorFalseFileKey, filename);
+		}
+	}
+	return buffer;
+}
+
+/**
+ * Returns true when 'line' is a comment line, i.e. its first character
+ * after optional white space is '#'. A '#' later in the line does not make
+ * the line a comment.
+ */
+bool EncKeys::isComment(const char *line) {
+	const char *error_p = NULL;
+	int offset = 0;
+	static const pcre *pattern = pcre_compile("^\\s*#", 0, &error_p, &offset, NULL);
+	if (NULL == pattern) return false;
+
+	int ovector[MAX_OFFSETS_IN_PCRE_PATTERNS];
+	int rc = pcre_exec(pattern, NULL, line, (int) strlen(line), 0, 0, ovector,
+			MAX_OFFSETS_IN_PCRE_PATTERNS);
+	return 0 <= rc;
+}
+
+
+/**
+ * Print a one-line summary of the key stored for 'id'.
+ * Only the id and the lengths of the IV and key strings are printed, never
+ * their content.
+ */
+void EncKeys::printKeyEntry( uint id)
+{
+	keyentry *entry = getKeys(id);
+	if( NULL == entry) printf("No such keyID = %u\n", id);
+	else printf("Key: id:%3u \tiv:%u hex digits \tkey:%u hex digits\n", (uint) entry->id,
+			(uint) strlen(entry->iv), (uint) strlen(entry->key));
+}
diff --git a/storage/xtradb/enc/KeySingleton.cc b/storage/xtradb/enc/KeySingleton.cc
new file mode 100644
index 0000000000000..6a3ea1908c41e
--- /dev/null
+++ b/storage/xtradb/enc/KeySingleton.cc
@@ -0,0 +1,58 @@
+/* Copyright (C) 2014 eperi GmbH. All Rights Reserved.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
+
+/******************************************************************//**
+@file KeySingleton.cc
+Implementation of single pattern to keep keys for encrypting/decrypting pages.
+
+Created 09/13/2014 Florin Fugaciu
+***********************************************************************/
+
+
+#include "KeySingleton.h"
+#include
+
+
+bool KeySingleton::instanceInited = false;
+KeySingleton KeySingleton::theInstance;
+EncKeys KeySingleton::encKeys;
+
+
+
+/**
+ * Access the singleton without initializing it.
+ * A warning is printed on every call made while the keys are not
+ * initialized; the instance is returned in either case.
+ */
+KeySingleton & KeySingleton::getInstance() {
+	if (instanceInited) return theInstance;
+
+	printf("Encryption / decryption keys were not initialized. "
+			"You can not read encrypted tables or columns\n\n");
+	return theInstance;
+}
+
+/**
+ * Access the singleton and load the keys if that has not succeeded before.
+ * The arguments are handed to EncKeys::initKeys(); a warning is printed
+ * when it reports failure. The instance is returned in either case.
+ */
+KeySingleton & KeySingleton::getInstance(const char *name, const char *url,
+		const int initType, const char *filekey) {
+	if (!instanceInited) {
+		const bool loaded = encKeys.initKeys(name, url, initType, filekey);
+		instanceInited = loaded;
+		if (!loaded) {
+			printf("Could not initialize any of the encryption / decryption keys. "
+					"You can not read encrypted tables or columns\n\n");
+		}
+	}
+	return theInstance;
+}
+
+/** Look up the key entry for 'id' in the shared key store. */
+keyentry *KeySingleton::getKeys(int id) {
+	keyentry *found = encKeys.getKeys(id);
+	return found;
+}
+
diff --git a/storage/xtradb/enc/keyfile.c b/storage/xtradb/enc/keyfile.c
new file mode 100644
index 0000000000000..1afed215666f2
--- /dev/null
+++ b/storage/xtradb/enc/keyfile.c
@@ -0,0 +1,178 @@
+/* Copyright (C) 2014 eperi GmbH. All Rights Reserved.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
+
+/******************************************************************/
+//Author Clemens Doerrhoefer
+
+#include
+#include
+#include
+#include
+#include
+
+#define E_WRONG_NUMBER_OF_MATCHES 10
+#define MAX_KEY_FILE_SIZE 1048576
+#define MAX_BUFFER_LENGTH 512
+
+#define KEY_FILE_PARSE_OK 0
+#define KEY_FILE_TOO_BIG 100
+#define KEY_BUFFER_TOO_BIG 200
+#define KEY_FILE_PARSE_NULL 300
+#define KEY_FILE_TOO_MANY_KEYS 400
+
+
+/*
+  Returns 1 when 'line' is a comment line, i.e. its first character after
+  optional white space is '#', and 0 otherwise. The compiled pattern is
+  released before returning.
+*/
+int
+isComment(char *line)
+{
+  const char *error_p = NULL;
+  int offset = 0;
+  int ovector[30];
+  int rc;
+  pcre *pattern = pcre_compile("^\\s*#", 0, &error_p, &offset, NULL);
+
+  if(pattern == NULL) {
+    return 0;
+  }
+  rc = pcre_exec(pattern, NULL, line, (int) strlen(line), 0, 0, ovector, 30);
+  pcre_free(pattern);
+  return rc < 0 ? 0 : 1;
+}
+
+/*
+  Read the whole key file from 'fp', decrypt it with 'secret' when it starts
+  with the "Salted__" marker, and store one allocated keyentry per valid line
+  in allKeys[entry->id]. Lines that do not parse are skipped.
+
+  Returns KEY_FILE_PARSE_OK on success, 100 when 'fp' is NULL,
+  KEY_FILE_TOO_BIG when the file exceeds MAX_KEY_FILE_SIZE, and
+  KEY_FILE_PARSE_NULL when the file cannot be read or decrypted or memory
+  runs out.
+
+  NOTE(review): an entry that is already stored under the same id is
+  overwritten without being freed, because the initial content of 'allKeys'
+  is not known here - confirm against the caller.
+*/
+int
+parseFile(FILE * fp, struct keyentry **allKeys, const int k_len, const char *secret)
+{
+  enum { MAGIC_SIZE = 8, SALT_SIZE = 8, KEY_SIZE = 32, IV_SIZE = 16 };
+  const char *MAGIC = "Salted__";
+  long file_size = 0;
+  size_t text_len = 0;
+  char *buffer = NULL;
+  char *line = NULL;
+
+  if(NULL == fp) {
+    fprintf(stderr, "Key file not found.\n");
+    return 100; /* historic value, numerically the same as KEY_FILE_TOO_BIG */
+  }
+
+  //get size of file
+  if(fseek(fp, 0L, SEEK_END) != 0 || (file_size = ftell(fp)) < 0 ||
+     fseek(fp, 0L, SEEK_SET) != 0) {
+    return KEY_FILE_PARSE_NULL;
+  }
+
+  if(file_size > MAX_KEY_FILE_SIZE) {
+    return KEY_FILE_TOO_BIG;
+  }
+
+  //Read file into buffer
+  buffer = malloc((size_t) file_size + 1);
+  if(buffer == NULL) {
+    return KEY_FILE_PARSE_NULL;
+  }
+  text_len = fread(buffer, 1, (size_t) file_size, fp);
+  buffer[text_len] = '\0'; /* strtok() below needs a terminated string */
+
+  //Check for file encryption
+  if(text_len >= MAGIC_SIZE && memcmp(buffer, MAGIC, MAGIC_SIZE) == 0) { //If file is encrypted, decrypt it first.
+    unsigned char salt[SALT_SIZE];
+    unsigned char key[KEY_SIZE];
+    unsigned char iv[IV_SIZE];
+    unsigned long int d_size = 0;
+    char *decrypted = NULL;
+    int res = -1;
+
+    if(text_len >= MAGIC_SIZE + SALT_SIZE) {
+      decrypted = malloc(text_len);
+    }
+    if(decrypted != NULL) {
+      memcpy(salt, buffer + MAGIC_SIZE, SALT_SIZE);
+      my_bytes_to_key(salt, secret, key, iv);
+      res = my_aes_decrypt_cbc(buffer + MAGIC_SIZE + SALT_SIZE,
+                               text_len - MAGIC_SIZE - SALT_SIZE,
+                               decrypted, &d_size, key, KEY_SIZE, iv, IV_SIZE);
+      if(res == 0) {
+        memcpy(buffer, decrypted, d_size);
+        buffer[d_size] = '\0';
+      }
+      free(decrypted);
+    }
+    if(res != 0) {
+      free(buffer);
+      return KEY_FILE_PARSE_NULL;
+    }
+  }
+
+  for(line = strtok(buffer, "\n"); line != NULL; line = strtok(NULL, "\n")) {
+    struct keyentry *entry = malloc(sizeof *entry);
+    if(entry == NULL) {
+      free(buffer);
+      return KEY_FILE_PARSE_NULL;
+    }
+    if(parseLine(line, entry, k_len) == KEY_FILE_PARSE_OK) {
+      allKeys[entry->id] = entry;
+    } else {
+      free(entry); /* parseLine() allocates nothing on failure */
+    }
+  }
+  free(buffer);
+  return KEY_FILE_PARSE_OK;
+}
+
+/* Copy 'length' characters starting at 'start' into a new NUL-terminated string, or return NULL. */
+static char*
+copyRange(const char *start, int length)
+{
+  char *copy = malloc((size_t) length + 1);
+  if(copy != NULL) {
+    memcpy(copy, start, (size_t) length);
+    copy[length] = '\0';
+  }
+  return copy;
+}
+
+/*
+  Parse one "<id>;<32 hex digits iv>;<32, 48 or 64 hex digits key>" line.
+  On success entry->id is set and entry->iv and entry->key receive newly
+  allocated NUL-terminated copies of the hex strings. On failure
+  entry->iv and entry->key are left untouched and nothing stays allocated.
+
+  Returns KEY_FILE_PARSE_OK, E_WRONG_NUMBER_OF_MATCHES when the line does
+  not match or is a comment, KEY_FILE_TOO_MANY_KEYS when the id is not below
+  'k_len', and KEY_FILE_PARSE_NULL for id 0 or when memory runs out.
+*/
+int
+parseLine(const char *line, struct keyentry *entry, const int k_len)
+{
+  const char *error_p = NULL;
+  int offset = 0;
+  int ovector[30];
+  int rc;
+  int id_length;
+  char id_text[10];
+  char *iv_copy;
+  char *key_copy;
+  pcre *pattern = pcre_compile(
+    "([0-9]+);([0-9a-fA-F]{32});([0-9a-fA-F]{64}|[0-9a-fA-F]{48}|[0-9a-fA-F]{32})",
+    0,
+    &error_p,
+    &offset,
+    NULL);
+
+  if(pattern == NULL) {
+    fprintf(stderr, "Error: %s\n", error_p != NULL ? error_p : "unknown");
+    fprintf(stderr, "Offset: %d\n", offset);
+    return E_WRONG_NUMBER_OF_MATCHES;
+  }
+  rc = pcre_exec(pattern, NULL, line, (int) strlen(line), 0, 0, ovector, 30);
+  pcre_free(pattern);
+  if(rc != 4 || isComment((char *) line)) {
+    return E_WRONG_NUMBER_OF_MATCHES;
+  }
+
+  /* At most nine digits, so the value always fits into an int. */
+  id_length = ovector[3] - ovector[2];
+  if(id_length >= (int) sizeof id_text) {
+    return KEY_FILE_TOO_MANY_KEYS;
+  }
+  memcpy(id_text, line + ovector[2], (size_t) id_length);
+  id_text[id_length] = '\0';
+  entry->id = atoi(id_text);
+  if(entry->id >= k_len)
+    return KEY_FILE_TOO_MANY_KEYS;
+  if(entry->id == 0) {
+    return KEY_FILE_PARSE_NULL;
+  }
+
+  iv_copy = copyRange(line + ovector[4], ovector[5] - ovector[4]);
+  key_copy = copyRange(line + ovector[6], ovector[7] - ovector[6]);
+  if(iv_copy == NULL || key_copy == NULL) {
+    free(iv_copy);
+    free(key_copy);
+    return KEY_FILE_PARSE_NULL;
+  }
+  entry->iv = iv_copy;
+  entry->key = key_copy;
+  return KEY_FILE_PARSE_OK;
+}
diff --git a/storage/xtradb/fil/fil0fil.cc b/storage/xtradb/fil/fil0fil.cc
index 017e96e611154..e8a59622b67d0 100644
--- a/storage/xtradb/fil/fil0fil.cc
+++ b/storage/xtradb/fil/fil0fil.cc
@@ -56,6 +56,10 @@ Created 10/25/1995 Heikki Tuuri
 static ulint srv_data_read, srv_data_written;
 #endif /* !UNIV_HOTBACKUP */
 #include "fil0pagecompress.h"
+
+#include "fil0pageencryption.h"
+#include "fsp0pageencryption.h"
+
 #include "zlib.h"
 #ifdef __linux__
 #include
@@ -750,7 +754,10 @@ fil_node_open_file(
 	ut_ad(mutex_own(&(system->mutex)));
 	ut_a(node->n_pending == 0);
 	ut_a(node->open == FALSE);
-
+ if (strcmp(node->name,"test/b")==0) {
+ fprintf(stderr,"file access: %s", node->name);
+ fflush(stderr);
+ }
 	if (node->size == 0) {
 		/* It must be a single-table tablespace and we do not know the
 		size of the file yet. First we open the file in the normal
@@ -2140,7 +2147,9 @@ fil_read_first_page(
 	/* Align the memory for a possible read from a raw device */
 
 	page = static_cast(ut_align(buf, UNIV_PAGE_SIZE));
-
+if (orig_space_id==18446744073709551615) {
+ //return NULL;
+}
 	os_file_read(data_file, page, 0, UNIV_PAGE_SIZE,
 		orig_space_id != ULINT_UNDEFINED ? 
fil_space_is_page_compressed(orig_space_id) : @@ -2154,9 +2163,9 @@ fil_read_first_page( ulint write_size=0; fil_decompress_page(NULL, page, UNIV_PAGE_SIZE, &write_size); } - *space_id = fsp_header_get_space_id(page); + flushed_lsn = mach_read_from_8(page + FIL_PAGE_FILE_FLUSH_LSN); if (!one_read_already) { @@ -2586,7 +2595,7 @@ static ulint fil_check_pending_io( /*=================*/ - fil_space_t* space, /*!< in/out: Tablespace to check */ + fil_space_t* space, /*!< in/out: Tablespace to chemismatchck */ fil_node_t** node, /*!< out: Node in space list */ ulint count) /*!< in: number of attempts so far */ { @@ -5267,7 +5276,7 @@ fil_extend_space_to_desired_size( success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC, node->name, node->handle, buf, offset, page_size * n_pages, - NULL, NULL, space_id, NULL, 0, 0, 0); + NULL, NULL, space_id, NULL, 0, 0, 0, 0, 0); #endif /* UNIV_HOTBACKUP */ if (success) { os_has_said_disk_full = FALSE; @@ -5660,6 +5669,9 @@ _fil_io( ibool ignore_nonexistent_pages; ibool page_compressed = FALSE; ulint page_compression_level = 0; + ibool page_encrypted = FALSE; + ulint page_encryption_key = 0; + is_log = type & OS_FILE_LOG; type = type & ~OS_FILE_LOG; @@ -5729,6 +5741,11 @@ _fil_io( page_compressed = fsp_flags_is_page_compressed(space->flags); page_compression_level = fsp_flags_get_page_compression_level(space->flags); + + page_encrypted = fsp_flags_is_page_encrypted(space->flags); + page_encryption_key = fsp_flags_get_page_encryption_key(space->flags); + + /* If we are deleting a tablespace we don't allow any read operations on that. However, we do allow write operations. 
*/ if (space == 0 || (type == OS_FILE_READ && space->stop_new_ops)) { @@ -5873,9 +5890,8 @@ _fil_io( } /* Queue the aio request */ - ret = os_aio(type, mode | wake_later, node->name, node->handle, buf, - offset, len, node, message, space_id, trx, - page_compressed, page_compression_level, write_size); + ret = os_aio(type, mode | wake_later, node->name, node->handle, buf, + offset, len, node, message, space_id, trx, page_compressed, page_compression_level, write_size, page_encrypted, page_encryption_key); #else /* In ibbackup do normal i/o, not aio */ diff --git a/storage/xtradb/fil/fil0fil.cc.orig b/storage/xtradb/fil/fil0fil.cc.orig new file mode 100644 index 0000000000000..96a80aaab6be0 --- /dev/null +++ b/storage/xtradb/fil/fil0fil.cc.orig @@ -0,0 +1,6885 @@ +/***************************************************************************** + +Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/**************************************************//** +@file fil/fil0fil.cc +The tablespace memory cache + +Created 10/25/1995 Heikki Tuuri +*******************************************************/ + +#include "fil0fil.h" + +#include +#include + +#include "mem0mem.h" +#include "hash0hash.h" +#include "os0file.h" +#include "mach0data.h" +#include "buf0buf.h" +#include "buf0flu.h" +#include "log0recv.h" +#include "fsp0fsp.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "mtr0mtr.h" +#include "mtr0log.h" +#include "dict0dict.h" +#include "page0page.h" +#include "page0zip.h" +#include "trx0sys.h" +#include "row0mysql.h" +#ifndef UNIV_HOTBACKUP +# include "buf0lru.h" +# include "ibuf0ibuf.h" +# include "sync0sync.h" +# include "os0sync.h" +#else /* !UNIV_HOTBACKUP */ +# include "srv0srv.h" +static ulint srv_data_read, srv_data_written; +#endif /* !UNIV_HOTBACKUP */ +#include "fil0pagecompress.h" + +#include "fil0pageencryption.h" +#include "fsp0pageencryption.h" + +#include "zlib.h" +#ifdef __linux__ +#include +#include +#include +#endif +#include "row0mysql.h" + +/* + IMPLEMENTATION OF THE TABLESPACE MEMORY CACHE + ============================================= + +The tablespace cache is responsible for providing fast read/write access to +tablespaces and logs of the database. File creation and deletion is done +in other modules which know more of the logic of the operation, however. + +A tablespace consists of a chain of files. The size of the files does not +have to be divisible by the database block size, because we may just leave +the last incomplete block unused. When a new file is appended to the +tablespace, the maximum size of the file is also specified. 
At the moment, +we think that it is best to extend the file to its maximum size already at +the creation of the file, because then we can avoid dynamically extending +the file when more space is needed for the tablespace. + +A block's position in the tablespace is specified with a 32-bit unsigned +integer. The files in the chain are thought to be catenated, and the block +corresponding to an address n is the nth block in the catenated file (where +the first block is named the 0th block, and the incomplete block fragments +at the end of files are not taken into account). A tablespace can be extended +by appending a new file at the end of the chain. + +Our tablespace concept is similar to the one of Oracle. + +To acquire more speed in disk transfers, a technique called disk striping is +sometimes used. This means that logical block addresses are divided in a +round-robin fashion across several disks. Windows NT supports disk striping, +so there we do not need to support it in the database. Disk striping is +implemented in hardware in RAID disks. We conclude that it is not necessary +to implement it in the database. Oracle 7 does not support disk striping, +either. + +Another trick used at some database sites is replacing tablespace files by +raw disks, that is, the whole physical disk drive, or a partition of it, is +opened as a single file, and it is accessed through byte offsets calculated +from the start of the disk or the partition. This is recommended in some +books on database tuning to achieve more speed in i/o. Using raw disk +certainly prevents the OS from fragmenting disk space, but it is not clear +if it really adds speed. We measured on the Pentium 100 MHz + NT + NTFS file +system + EIDE Conner disk only a negligible difference in speed when reading +from a file, versus reading from a raw disk. + +To have fast access to a tablespace or a log file, we put the data structures +to a hash table. 
Each tablespace and log file is given an unique 32-bit +identifier. + +Some operating systems do not support many open files at the same time, +though NT seems to tolerate at least 900 open files. Therefore, we put the +open files in an LRU-list. If we need to open another file, we may close the +file at the end of the LRU-list. When an i/o-operation is pending on a file, +the file cannot be closed. We take the file nodes with pending i/o-operations +out of the LRU-list and keep a count of pending operations. When an operation +completes, we decrement the count and return the file node to the LRU-list if +the count drops to zero. */ + +/** When mysqld is run, the default directory "." is the mysqld datadir, +but in the MySQL Embedded Server Library and ibbackup it is not the default +directory, and we must set the base file path explicitly */ +UNIV_INTERN const char* fil_path_to_mysql_datadir = "."; + +/** The number of fsyncs done to the log */ +UNIV_INTERN ulint fil_n_log_flushes = 0; + +/** Number of pending redo log flushes */ +UNIV_INTERN ulint fil_n_pending_log_flushes = 0; +/** Number of pending tablespace flushes */ +UNIV_INTERN ulint fil_n_pending_tablespace_flushes = 0; + +/** Number of files currently open */ +UNIV_INTERN ulint fil_n_file_opened = 0; + +/** The null file address */ +UNIV_INTERN fil_addr_t fil_addr_null = {FIL_NULL, 0}; + +#ifdef UNIV_PFS_MUTEX +/* Key to register fil_system_mutex with performance schema */ +UNIV_INTERN mysql_pfs_key_t fil_system_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +#ifdef UNIV_PFS_RWLOCK +/* Key to register file space latch with performance schema */ +UNIV_INTERN mysql_pfs_key_t fil_space_latch_key; +#endif /* UNIV_PFS_RWLOCK */ + +/** File node of a tablespace or the log data space */ +struct fil_node_t { + fil_space_t* space; /*!< backpointer to the space where this node + belongs */ + char* name; /*!< path to the file */ + ibool open; /*!< TRUE if file open */ + os_file_t handle; /*!< OS handle to the file, if 
file open */ + os_event_t sync_event;/*!< Condition event to group and + serialize calls to fsync */ + ibool is_raw_disk;/*!< TRUE if the 'file' is actually a raw + device or a raw disk partition */ + ulint size; /*!< size of the file in database pages, 0 if + not known yet; the possible last incomplete + megabyte may be ignored if space == 0 */ + ulint n_pending; + /*!< count of pending i/o's on this file; + closing of the file is not allowed if + this is > 0 */ + ulint n_pending_flushes; + /*!< count of pending flushes on this file; + closing of the file is not allowed if + this is > 0 */ + ibool being_extended; + /*!< TRUE if the node is currently + being extended. */ + ib_int64_t modification_counter;/*!< when we write to the file we + increment this by one */ + ib_int64_t flush_counter;/*!< up to what + modification_counter value we have + flushed the modifications to disk */ + UT_LIST_NODE_T(fil_node_t) chain; + /*!< link field for the file chain */ + UT_LIST_NODE_T(fil_node_t) LRU; + /*!< link field for the LRU list */ + ulint magic_n;/*!< FIL_NODE_MAGIC_N */ +}; + +/** Value of fil_node_t::magic_n */ +#define FIL_NODE_MAGIC_N 89389 + +/** Tablespace or log data space: let us call them by a common name space */ +struct fil_space_t { + char* name; /*!< space name = the path to the first file in + it */ + ulint id; /*!< space id */ + ib_int64_t tablespace_version; + /*!< in DISCARD/IMPORT this timestamp + is used to check if we should ignore + an insert buffer merge request for a + page because it actually was for the + previous incarnation of the space */ + ibool mark; /*!< this is set to TRUE at database startup if + the space corresponds to a table in the InnoDB + data dictionary; so we can print a warning of + orphaned tablespaces */ + ibool stop_ios;/*!< TRUE if we want to rename the + .ibd file of tablespace and want to + stop temporarily posting of new i/o + requests on the file */ + ibool stop_new_ops; + /*!< we set this TRUE when we start + deleting a 
single-table tablespace. + When this is set following new ops + are not allowed: + * read IO request + * ibuf merge + * file flush + Note that we can still possibly have + new write operations because we don't + check this flag when doing flush + batches. */ + ulint purpose;/*!< FIL_TABLESPACE, FIL_LOG, or + FIL_ARCH_LOG */ + UT_LIST_BASE_NODE_T(fil_node_t) chain; + /*!< base node for the file chain */ + ulint size; /*!< space size in pages; 0 if a single-table + tablespace whose size we do not know yet; + last incomplete megabytes in data files may be + ignored if space == 0 */ + ulint flags; /*!< tablespace flags; see + fsp_flags_is_valid(), + fsp_flags_get_zip_size() */ + ulint n_reserved_extents; + /*!< number of reserved free extents for + ongoing operations like B-tree page split */ + ulint n_pending_flushes; /*!< this is positive when flushing + the tablespace to disk; dropping of the + tablespace is forbidden if this is positive */ + ulint n_pending_ops;/*!< this is positive when we + have pending operations against this + tablespace. The pending operations can + be ibuf merges or lock validation code + trying to read a block. 
+ Dropping of the tablespace is forbidden + if this is positive */ + hash_node_t hash; /*!< hash chain node */ + hash_node_t name_hash;/*!< hash chain the name_hash table */ +#ifndef UNIV_HOTBACKUP + prio_rw_lock_t latch; /*!< latch protecting the file space storage + allocation */ +#endif /* !UNIV_HOTBACKUP */ + UT_LIST_NODE_T(fil_space_t) unflushed_spaces; + /*!< list of spaces with at least one unflushed + file we have written to */ + bool is_in_unflushed_spaces; + /*!< true if this space is currently in + unflushed_spaces */ + ibool is_corrupt; + UT_LIST_NODE_T(fil_space_t) space_list; + /*!< list of all spaces */ + ulint magic_n;/*!< FIL_SPACE_MAGIC_N */ +}; + +/** Value of fil_space_t::magic_n */ +#define FIL_SPACE_MAGIC_N 89472 + +/** The tablespace memory cache; also the totality of logs (the log +data space) is stored here; below we talk about tablespaces, but also +the ib_logfiles form a 'space' and it is handled here */ +struct fil_system_t { +#ifndef UNIV_HOTBACKUP + ib_mutex_t mutex; /*!< The mutex protecting the cache */ +#endif /* !UNIV_HOTBACKUP */ + hash_table_t* spaces; /*!< The hash table of spaces in the + system; they are hashed on the space + id */ + hash_table_t* name_hash; /*!< hash table based on the space + name */ + UT_LIST_BASE_NODE_T(fil_node_t) LRU; + /*!< base node for the LRU list of the + most recently used open files with no + pending i/o's; if we start an i/o on + the file, we first remove it from this + list, and return it to the start of + the list when the i/o ends; + log files and the system tablespace are + not put to this list: they are opened + after the startup, and kept open until + shutdown */ + UT_LIST_BASE_NODE_T(fil_space_t) unflushed_spaces; + /*!< base node for the list of those + tablespaces whose files contain + unflushed writes; those spaces have + at least one file node where + modification_counter > flush_counter */ + ulint n_open; /*!< number of files currently open */ + ulint max_n_open; /*!< n_open is not 
allowed to exceed + this */ + ib_int64_t modification_counter;/*!< when we write to a file we + increment this by one */ + ulint max_assigned_id;/*!< maximum space id in the existing + tables, or assigned during the time + mysqld has been up; at an InnoDB + startup we scan the data dictionary + and set here the maximum of the + space id's of the tables there */ + ib_int64_t tablespace_version; + /*!< a counter which is incremented for + every space object memory creation; + every space mem object gets a + 'timestamp' from this; in DISCARD/ + IMPORT this is used to check if we + should ignore an insert buffer merge + request */ + UT_LIST_BASE_NODE_T(fil_space_t) space_list; + /*!< list of all file spaces */ + ibool space_id_reuse_warned; + /* !< TRUE if fil_space_create() + has issued a warning about + potential space_id reuse */ +}; + +/** The tablespace memory cache. This variable is NULL before the module is +initialized. */ +static fil_system_t* fil_system = NULL; + +/** Determine if (i) is a user tablespace id or not. */ +# define fil_is_user_tablespace_id(i) ((i) > srv_undo_tablespaces_open) + +/** Determine if user has explicitly disabled fsync(). */ +#ifndef __WIN__ +# define fil_buffering_disabled(s) \ + (((s)->purpose == FIL_TABLESPACE \ + && srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)\ + || ((s)->purpose == FIL_LOG \ + && srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT)) + +#else /* __WIN__ */ +# define fil_buffering_disabled(s) (0) +#endif /* __WIN__ */ + +#ifdef UNIV_DEBUG +/** Try fil_validate() every this many times */ +# define FIL_VALIDATE_SKIP 17 + +/******************************************************************//** +Checks the consistency of the tablespace cache some of the time. +@return TRUE if ok or the check was skipped */ +static +ibool +fil_validate_skip(void) +/*===================*/ +{ + /** The fil_validate() call skip counter. Use a signed type + because of the race condition below. 
*/ + static int fil_validate_count = FIL_VALIDATE_SKIP; + + /* There is a race condition below, but it does not matter, + because this call is only for heuristic purposes. We want to + reduce the call frequency of the costly fil_validate() check + in debug builds. */ + if (--fil_validate_count > 0) { + return(TRUE); + } + + fil_validate_count = FIL_VALIDATE_SKIP; + return(fil_validate()); +} +#endif /* UNIV_DEBUG */ + +/********************************************************************//** +Determines if a file node belongs to the least-recently-used list. +@return TRUE if the file belongs to fil_system->LRU mutex. */ +UNIV_INLINE +ibool +fil_space_belongs_in_lru( +/*=====================*/ + const fil_space_t* space) /*!< in: file space */ +{ + return(space->purpose == FIL_TABLESPACE + && fil_is_user_tablespace_id(space->id)); +} + +/********************************************************************//** +NOTE: you must call fil_mutex_enter_and_prepare_for_io() first! + +Prepares a file node for i/o. Opens the file if it is closed. Updates the +pending i/o's field in the node and the system appropriately. Takes the node +off the LRU list if it is in the LRU list. The caller must hold the fil_sys +mutex. +@return false if the file can't be opened, otherwise true */ +static +bool +fil_node_prepare_for_io( +/*====================*/ + fil_node_t* node, /*!< in: file node */ + fil_system_t* system, /*!< in: tablespace memory cache */ + fil_space_t* space); /*!< in: space */ +/********************************************************************//** +Updates the data structures when an i/o operation finishes. Updates the +pending i/o's field in the node appropriately. 
*/ +static +void +fil_node_complete_io( +/*=================*/ + fil_node_t* node, /*!< in: file node */ + fil_system_t* system, /*!< in: tablespace memory cache */ + ulint type); /*!< in: OS_FILE_WRITE or OS_FILE_READ; marks + the node as modified if + type == OS_FILE_WRITE */ +/*******************************************************************//** +Frees a space object from the tablespace memory cache. Closes the files in +the chain but does not delete them. There must not be any pending i/o's or +flushes on the files. +@return TRUE on success */ +static +ibool +fil_space_free( +/*===========*/ + ulint id, /* in: space id */ + ibool x_latched); /* in: TRUE if caller has space->latch + in X mode */ +/********************************************************************//** +Reads data from a space to a buffer. Remember that the possible incomplete +blocks at the end of file are ignored: they are not taken into account when +calculating the byte offset within a space. +@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do +i/o on a tablespace which does not exist */ +UNIV_INLINE +dberr_t +fil_read( +/*=====*/ + bool sync, /*!< in: true if synchronous aio is desired */ + ulint space_id, /*!< in: space id */ + ulint zip_size, /*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint block_offset, /*!< in: offset in number of blocks */ + ulint byte_offset, /*!< in: remainder of offset in bytes; in aio + this must be divisible by the OS block size */ + ulint len, /*!< in: how many bytes to read; this must not + cross a file boundary; in aio this must be a + block size multiple */ + void* buf, /*!< in/out: buffer where to store data read; + in aio this must be appropriately aligned */ + void* message, /*!< in: message for aio handler if non-sync + aio used, else ignored */ + ulint* write_size) /*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + 
actual page size does not decrease. */ +{ + return(fil_io(OS_FILE_READ, sync, space_id, zip_size, block_offset, + byte_offset, len, buf, message, write_size)); +} + +/********************************************************************//** +Writes data to a space from a buffer. Remember that the possible incomplete +blocks at the end of file are ignored: they are not taken into account when +calculating the byte offset within a space. +@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do +i/o on a tablespace which does not exist */ +UNIV_INLINE +dberr_t +fil_write( +/*======*/ + bool sync, /*!< in: true if synchronous aio is desired */ + ulint space_id, /*!< in: space id */ + ulint zip_size, /*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint block_offset, /*!< in: offset in number of blocks */ + ulint byte_offset, /*!< in: remainder of offset in bytes; in aio + this must be divisible by the OS block size */ + ulint len, /*!< in: how many bytes to write; this must + not cross a file boundary; in aio this must + be a block size multiple */ + void* buf, /*!< in: buffer from which to write; in aio + this must be appropriately aligned */ + void* message, /*!< in: message for aio handler if non-sync + aio used, else ignored */ + ulint* write_size) /*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ +{ + ut_ad(!srv_read_only_mode); + + return(fil_io(OS_FILE_WRITE, sync, space_id, zip_size, block_offset, + byte_offset, len, buf, message, write_size)); +} + +/*******************************************************************//** +Returns the table space by a given id, NULL if not found. 
*/ +fil_space_t* +fil_space_get_by_id( +/*================*/ + ulint id) /*!< in: space id */ +{ + fil_space_t* space; + + ut_ad(mutex_own(&fil_system->mutex)); + + HASH_SEARCH(hash, fil_system->spaces, id, + fil_space_t*, space, + ut_ad(space->magic_n == FIL_SPACE_MAGIC_N), + space->id == id); + + return(space); +} + +/****************************************************************//** +Get space id from fil node */ +ulint +fil_node_get_space_id( +/*==================*/ + fil_node_t* node) /*!< in: Compressed node*/ +{ + ut_ad(node); + ut_ad(node->space); + + return (node->space->id); +} + +/*******************************************************************//** +Returns the table space by a given name, NULL if not found. */ +UNIV_INLINE +fil_space_t* +fil_space_get_by_name( +/*==================*/ + const char* name) /*!< in: space name */ +{ + fil_space_t* space; + ulint fold; + + ut_ad(mutex_own(&fil_system->mutex)); + + fold = ut_fold_string(name); + + HASH_SEARCH(name_hash, fil_system->name_hash, fold, + fil_space_t*, space, + ut_ad(space->magic_n == FIL_SPACE_MAGIC_N), + !strcmp(name, space->name)); + + return(space); +} + +#ifndef UNIV_HOTBACKUP +/*******************************************************************//** +Returns the version number of a tablespace, -1 if not found. +@return version number, -1 if the tablespace does not exist in the +memory cache */ +UNIV_INTERN +ib_int64_t +fil_space_get_version( +/*==================*/ + ulint id) /*!< in: space id */ +{ + fil_space_t* space; + ib_int64_t version = -1; + + ut_ad(fil_system); + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(id); + + if (space) { + version = space->tablespace_version; + } + + mutex_exit(&fil_system->mutex); + + return(version); +} + +/*******************************************************************//** +Returns the latch of a file space. 
+@return latch protecting storage allocation */ +UNIV_INTERN +prio_rw_lock_t* +fil_space_get_latch( +/*================*/ + ulint id, /*!< in: space id */ + ulint* flags) /*!< out: tablespace flags */ +{ + fil_space_t* space; + + ut_ad(fil_system); + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(id); + + ut_a(space); + + if (flags) { + *flags = space->flags; + } + + mutex_exit(&fil_system->mutex); + + return(&(space->latch)); +} + +/*******************************************************************//** +Returns the type of a file space. +@return FIL_TABLESPACE or FIL_LOG */ +UNIV_INTERN +ulint +fil_space_get_type( +/*===============*/ + ulint id) /*!< in: space id */ +{ + fil_space_t* space; + + ut_ad(fil_system); + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(id); + + ut_a(space); + + mutex_exit(&fil_system->mutex); + + return(space->purpose); +} +#endif /* !UNIV_HOTBACKUP */ + +/**********************************************************************//** +Checks if all the file nodes in a space are flushed. The caller must hold +the fil_system mutex. +@return true if all are flushed */ +static +bool +fil_space_is_flushed( +/*=================*/ + fil_space_t* space) /*!< in: space */ +{ + fil_node_t* node; + + ut_ad(mutex_own(&fil_system->mutex)); + + node = UT_LIST_GET_FIRST(space->chain); + + while (node) { + if (node->modification_counter > node->flush_counter) { + + ut_ad(!fil_buffering_disabled(space)); + return(false); + } + + node = UT_LIST_GET_NEXT(chain, node); + } + + return(true); +} + +/*******************************************************************//** +Appends a new file to the chain of files of a space. File must be closed. 
+@return pointer to the file name, or NULL on error */ +UNIV_INTERN +char* +fil_node_create( +/*============*/ + const char* name, /*!< in: file name (file must be closed) */ + ulint size, /*!< in: file size in database blocks, rounded + downwards to an integer */ + ulint id, /*!< in: space id where to append */ + ibool is_raw) /*!< in: TRUE if a raw device or + a raw disk partition */ +{ + fil_node_t* node; + fil_space_t* space; + + ut_a(fil_system); + ut_a(name); + + mutex_enter(&fil_system->mutex); + + node = static_cast(mem_zalloc(sizeof(fil_node_t))); + + node->name = mem_strdup(name); + + ut_a(!is_raw || srv_start_raw_disk_in_use); + + node->sync_event = os_event_create(); + node->is_raw_disk = is_raw; + node->size = size; + node->magic_n = FIL_NODE_MAGIC_N; + + space = fil_space_get_by_id(id); + + if (!space) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: Could not find tablespace %lu for\n" + "InnoDB: file ", (ulong) id); + ut_print_filename(stderr, name); + fputs(" in the tablespace memory cache.\n", stderr); + mem_free(node->name); + + mem_free(node); + + mutex_exit(&fil_system->mutex); + + return(NULL); + } + + space->size += size; + + node->space = space; + + UT_LIST_ADD_LAST(chain, space->chain, node); + + if (id < SRV_LOG_SPACE_FIRST_ID && fil_system->max_assigned_id < id) { + + fil_system->max_assigned_id = id; + } + + mutex_exit(&fil_system->mutex); + + return(node->name); +} + +/********************************************************************//** +Opens a file of a node of a tablespace. The caller must own the fil_system +mutex. 
+@return false if the file can't be opened, otherwise true */ +static +bool +fil_node_open_file( +/*===============*/ + fil_node_t* node, /*!< in: file node */ + fil_system_t* system, /*!< in: tablespace memory cache */ + fil_space_t* space) /*!< in: space */ +{ + os_offset_t size_bytes; + ibool ret; + ibool success; + byte* buf2; + byte* page; + ulint space_id; + ulint flags=0; + ulint page_size; + ulint atomic_writes=0; + + ut_ad(mutex_own(&(system->mutex))); + ut_a(node->n_pending == 0); + ut_a(node->open == FALSE); + + if (node->size == 0) { + /* It must be a single-table tablespace and we do not know the + size of the file yet. First we open the file in the normal + mode, no async I/O here, for simplicity. Then do some checks, + and close the file again. + NOTE that we could not use the simple file read function + os_file_read() in Windows to read from a file opened for + async I/O! */ + + node->handle = os_file_create_simple_no_error_handling( + innodb_file_data_key, node->name, OS_FILE_OPEN, + OS_FILE_READ_ONLY, &success, 0); + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(true); + + ut_print_timestamp(stderr); + + ib_logf(IB_LOG_LEVEL_WARN, "InnoDB: Error: cannot " + "open %s\n. 
InnoDB: Have you deleted .ibd " + "files under a running mysqld server?\n", + node->name); + + return(false); + } + + size_bytes = os_file_get_size(node->handle); + ut_a(size_bytes != (os_offset_t) -1); +#ifdef UNIV_HOTBACKUP + if (space->id == 0) { + node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE); + os_file_close(node->handle); + goto add_size; + } +#endif /* UNIV_HOTBACKUP */ + ut_a(space->purpose != FIL_LOG); + ut_a(fil_is_user_tablespace_id(space->id)); + + if (size_bytes < FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE) { + fprintf(stderr, + "InnoDB: Error: the size of single-table" + " tablespace file %s\n" + "InnoDB: is only "UINT64PF"," + " should be at least %lu!\n", + node->name, + size_bytes, + (ulong) (FIL_IBD_FILE_INITIAL_SIZE + * UNIV_PAGE_SIZE)); + + ut_a(0); + } + + /* Read the first page of the tablespace */ + + buf2 = static_cast(ut_malloc(2 * UNIV_PAGE_SIZE)); + /* Align the memory for file i/o if we might have O_DIRECT + set */ + page = static_cast(ut_align(buf2, UNIV_PAGE_SIZE)); + + success = os_file_read(node->handle, page, 0, UNIV_PAGE_SIZE, + space->flags); + + space_id = fsp_header_get_space_id(page); + flags = fsp_header_get_flags(page); + page_size = fsp_flags_get_page_size(flags); + atomic_writes = fsp_flags_get_atomic_writes(flags); + + ut_free(buf2); + + /* Close the file now that we have read the space id from it */ + + os_file_close(node->handle); + + if (UNIV_UNLIKELY(space_id != space->id)) { + fprintf(stderr, + "InnoDB: Error: tablespace id is %lu" + " in the data dictionary\n" + "InnoDB: but in file %s it is %lu!\n", + space->id, node->name, space_id); + + ut_error; + } + + if (UNIV_UNLIKELY(space_id == ULINT_UNDEFINED + || space_id == 0)) { + fprintf(stderr, + "InnoDB: Error: tablespace id %lu" + " in file %s is not sensible\n", + (ulong) space_id, node->name); + + ut_error; + } + + if (UNIV_UNLIKELY(fsp_flags_get_page_size(space->flags) + != page_size)) { + fprintf(stderr, + "InnoDB: Error: tablespace file %s" + " has page 
size 0x%lx\n" + "InnoDB: but the data dictionary" + " expects page size 0x%lx!\n", + node->name, flags, + fsp_flags_get_page_size(space->flags)); + + ut_error; + } + + if (UNIV_UNLIKELY(space->flags != flags)) { + fprintf(stderr, + "InnoDB: Error: table flags are 0x%lx" + " in the data dictionary\n" + "InnoDB: but the flags in file %s are 0x%lx!\n", + space->flags, node->name, flags); + + ut_error; + } + + if (UNIV_UNLIKELY(space->flags != flags)) { + if (!dict_tf_verify_flags(space->flags, flags)) { + fprintf(stderr, + "InnoDB: Error: table flags are 0x%lx" + " in the data dictionary\n" + "InnoDB: but the flags in file %s are 0x%lx!\n", + space->flags, node->name, flags); + ut_error; + } + } + + if (size_bytes >= FSP_EXTENT_SIZE * UNIV_PAGE_SIZE) { + /* Truncate the size to whole extent size. */ + size_bytes = ut_2pow_round(size_bytes, + FSP_EXTENT_SIZE * + UNIV_PAGE_SIZE); + } + + if (!fsp_flags_is_compressed(flags)) { + node->size = (ulint) + (size_bytes + / fsp_flags_get_page_size(flags)); + } else { + node->size = (ulint) + (size_bytes + / fsp_flags_get_zip_size(flags)); + } + +#ifdef UNIV_HOTBACKUP +add_size: +#endif /* UNIV_HOTBACKUP */ + space->size += node->size; + } + + atomic_writes = fsp_flags_get_atomic_writes(space->flags); + + /* printf("Opening file %s\n", node->name); */ + + /* Open the file for reading and writing, in Windows normally in the + unbuffered async I/O mode, though global variables may make + os_file_create() to fall back to the normal file I/O mode. 
*/ + + if (space->purpose == FIL_LOG) { + node->handle = os_file_create(innodb_file_log_key, + node->name, OS_FILE_OPEN, + OS_FILE_AIO, OS_LOG_FILE, + &ret, atomic_writes); + } else if (node->is_raw_disk) { + node->handle = os_file_create(innodb_file_data_key, + node->name, + OS_FILE_OPEN_RAW, + OS_FILE_AIO, OS_DATA_FILE, + &ret, atomic_writes); + } else { + node->handle = os_file_create(innodb_file_data_key, + node->name, OS_FILE_OPEN, + OS_FILE_AIO, OS_DATA_FILE, + &ret, atomic_writes); + } + + ut_a(ret); + + node->open = TRUE; + + system->n_open++; + fil_n_file_opened++; + + if (fil_space_belongs_in_lru(space)) { + + /* Put the node to the LRU list */ + UT_LIST_ADD_FIRST(LRU, system->LRU, node); + } + + return(true); +} + +/**********************************************************************//** +Closes a file. */ +static +void +fil_node_close_file( +/*================*/ + fil_node_t* node, /*!< in: file node */ + fil_system_t* system) /*!< in: tablespace memory cache */ +{ + ibool ret; + + ut_ad(node && system); + ut_ad(mutex_own(&(system->mutex))); + ut_a(node->open); + ut_a(node->n_pending == 0); + ut_a(node->n_pending_flushes == 0); + ut_a(!node->being_extended); +#ifndef UNIV_HOTBACKUP + ut_a(node->modification_counter == node->flush_counter + || srv_fast_shutdown == 2); +#endif /* !UNIV_HOTBACKUP */ + + ret = os_file_close(node->handle); + ut_a(ret); + + /* printf("Closing file %s\n", node->name); */ + + node->open = FALSE; + ut_a(system->n_open > 0); + system->n_open--; + fil_n_file_opened--; + + if (fil_space_belongs_in_lru(node->space)) { + + ut_a(UT_LIST_GET_LEN(system->LRU) > 0); + + /* The node is in the LRU list, remove it */ + UT_LIST_REMOVE(LRU, system->LRU, node); + } +} + +/********************************************************************//** +Tries to close a file in the LRU list. The caller must hold the fil_sys +mutex. 
+@return TRUE if success, FALSE if should retry later; since i/o's +generally complete in < 100 ms, and as InnoDB writes at most 128 pages +from the buffer pool in a batch, and then immediately flushes the +files, there is a good chance that the next time we find a suitable +node from the LRU list */ +static +ibool +fil_try_to_close_file_in_LRU( +/*=========================*/ + ibool print_info) /*!< in: if TRUE, prints information why it + cannot close a file */ +{ + fil_node_t* node; + + ut_ad(mutex_own(&fil_system->mutex)); + + if (print_info) { + fprintf(stderr, + "InnoDB: fil_sys open file LRU len %lu\n", + (ulong) UT_LIST_GET_LEN(fil_system->LRU)); + } + + for (node = UT_LIST_GET_LAST(fil_system->LRU); + node != NULL; + node = UT_LIST_GET_PREV(LRU, node)) { + + if (node->modification_counter == node->flush_counter + && node->n_pending_flushes == 0 + && !node->being_extended) { + + fil_node_close_file(node, fil_system); + + return(TRUE); + } + + if (!print_info) { + continue; + } + + if (node->n_pending_flushes > 0) { + fputs("InnoDB: cannot close file ", stderr); + ut_print_filename(stderr, node->name); + fprintf(stderr, ", because n_pending_flushes %lu\n", + (ulong) node->n_pending_flushes); + } + + if (node->modification_counter != node->flush_counter) { + fputs("InnoDB: cannot close file ", stderr); + ut_print_filename(stderr, node->name); + fprintf(stderr, + ", because mod_count %ld != fl_count %ld\n", + (long) node->modification_counter, + (long) node->flush_counter); + + } + + if (node->being_extended) { + fputs("InnoDB: cannot close file ", stderr); + ut_print_filename(stderr, node->name); + fprintf(stderr, ", because it is being extended\n"); + } + } + + return(FALSE); +} + +/*******************************************************************//** +Reserves the fil_system mutex and tries to make sure we can open at least one +file while holding it. 
This should be called before calling +fil_node_prepare_for_io(), because that function may need to open a file. */ +static +void +fil_mutex_enter_and_prepare_for_io( +/*===============================*/ + ulint space_id) /*!< in: space id */ +{ + fil_space_t* space; + ibool success; + ibool print_info = FALSE; + ulint count = 0; + ulint count2 = 0; + +retry: + mutex_enter(&fil_system->mutex); + + if (space_id == 0 || space_id >= SRV_LOG_SPACE_FIRST_ID) { + /* We keep log files and system tablespace files always open; + this is important in preventing deadlocks in this module, as + a page read completion often performs another read from the + insert buffer. The insert buffer is in tablespace 0, and we + cannot end up waiting in this function. */ + + return; + } + + space = fil_space_get_by_id(space_id); + + if (space != NULL && space->stop_ios) { + /* We are going to do a rename file and want to stop new i/o's + for a while */ + + if (count2 > 20000) { + fputs("InnoDB: Warning: tablespace ", stderr); + ut_print_filename(stderr, space->name); + fprintf(stderr, + " has i/o ops stopped for a long time %lu\n", + (ulong) count2); + } + + mutex_exit(&fil_system->mutex); + +#ifndef UNIV_HOTBACKUP + + /* Wake the i/o-handler threads to make sure pending + i/o's are performed */ + os_aio_simulated_wake_handler_threads(); + + /* The sleep here is just to give IO helper threads a + bit of time to do some work. It is not required that + all IO related to the tablespace being renamed must + be flushed here as we do fil_flush() in + fil_rename_tablespace() as well. 
*/ + os_thread_sleep(20000); + +#endif /* UNIV_HOTBACKUP */ + + /* Flush tablespaces so that we can close modified + files in the LRU list */ + fil_flush_file_spaces(FIL_TABLESPACE); + + os_thread_sleep(20000); + + count2++; + + goto retry; + } + + if (fil_system->n_open < fil_system->max_n_open) { + + return; + } + + /* If the file is already open, no need to do anything; if the space + does not exist, we handle the situation in the function which called + this function */ + + if (!space || UT_LIST_GET_FIRST(space->chain)->open) { + + return; + } + + if (count > 1) { + print_info = TRUE; + } + + /* Too many files are open, try to close some */ +close_more: + success = fil_try_to_close_file_in_LRU(print_info); + + if (success && fil_system->n_open >= fil_system->max_n_open) { + + goto close_more; + } + + if (fil_system->n_open < fil_system->max_n_open) { + /* Ok */ + + return; + } + + if (count >= 2) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: too many (%lu) files stay open" + " while the maximum\n" + "InnoDB: allowed value would be %lu.\n" + "InnoDB: You may need to raise the value of" + " innodb_open_files in\n" + "InnoDB: my.cnf.\n", + (ulong) fil_system->n_open, + (ulong) fil_system->max_n_open); + + return; + } + + mutex_exit(&fil_system->mutex); + +#ifndef UNIV_HOTBACKUP + /* Wake the i/o-handler threads to make sure pending i/o's are + performed */ + os_aio_simulated_wake_handler_threads(); + + os_thread_sleep(20000); +#endif + /* Flush tablespaces so that we can close modified files in the LRU + list */ + + fil_flush_file_spaces(FIL_TABLESPACE); + + count++; + + goto retry; +} + +/*******************************************************************//** +Frees a file node object from a tablespace memory cache. 
*/ +static +void +fil_node_free( +/*==========*/ + fil_node_t* node, /*!< in, own: file node */ + fil_system_t* system, /*!< in: tablespace memory cache */ + fil_space_t* space) /*!< in: space where the file node is chained */ +{ + ut_ad(node && system && space); + ut_ad(mutex_own(&(system->mutex))); + ut_a(node->magic_n == FIL_NODE_MAGIC_N); + ut_a(node->n_pending == 0); + ut_a(!node->being_extended); + + if (node->open) { + /* We fool the assertion in fil_node_close_file() to think + there are no unflushed modifications in the file */ + + node->modification_counter = node->flush_counter; + os_event_set(node->sync_event); + + if (fil_buffering_disabled(space)) { + + ut_ad(!space->is_in_unflushed_spaces); + ut_ad(fil_space_is_flushed(space)); + + } else if (space->is_in_unflushed_spaces + && fil_space_is_flushed(space)) { + + space->is_in_unflushed_spaces = false; + + UT_LIST_REMOVE(unflushed_spaces, + system->unflushed_spaces, + space); + } + + fil_node_close_file(node, system); + } + + space->size -= node->size; + + UT_LIST_REMOVE(chain, space->chain, node); + + os_event_free(node->sync_event); + mem_free(node->name); + mem_free(node); +} + +#ifdef UNIV_LOG_ARCHIVE +/****************************************************************//** +Drops files from the start of a file space, so that its size is cut by +the amount given. 
*/ +UNIV_INTERN +void +fil_space_truncate_start( +/*=====================*/ + ulint id, /*!< in: space id */ + ulint trunc_len) /*!< in: truncate by this much; it is an error + if this does not equal to the combined size of + some initial files in the space */ +{ + fil_node_t* node; + fil_space_t* space; + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(id); + + ut_a(space); + + while (trunc_len > 0) { + node = UT_LIST_GET_FIRST(space->chain); + + ut_a(node->size * UNIV_PAGE_SIZE <= trunc_len); + + trunc_len -= node->size * UNIV_PAGE_SIZE; + + fil_node_free(node, fil_system, space); + } + + mutex_exit(&fil_system->mutex); +} + +/****************************************************************//** +Check is there node in file space with given name. */ +UNIV_INTERN +ibool +fil_space_contains_node( +/*====================*/ + ulint id, /*!< in: space id */ + char* node_name) /*!< in: node name */ +{ + fil_node_t* node; + fil_space_t* space; + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(id); + + ut_a(space); + + for (node = UT_LIST_GET_FIRST(space->chain); node != NULL; + node = UT_LIST_GET_NEXT(chain, node)) { + + if (ut_strcmp(node->name, node_name) == 0) { + mutex_exit(&fil_system->mutex); + return(TRUE); + } + + } + + mutex_exit(&fil_system->mutex); + return(FALSE); +} + +#endif /* UNIV_LOG_ARCHIVE */ + +/*******************************************************************//** +Creates a space memory object and puts it to the 'fil system' hash table. +If there is an error, prints an error message to the .err log. 
+@return TRUE if success */ +UNIV_INTERN +ibool +fil_space_create( +/*=============*/ + const char* name, /*!< in: space name */ + ulint id, /*!< in: space id */ + ulint flags, /*!< in: tablespace flags */ + ulint purpose)/*!< in: FIL_TABLESPACE, or FIL_LOG if log */ +{ + fil_space_t* space; + + DBUG_EXECUTE_IF("fil_space_create_failure", return(false);); + + ut_a(fil_system); + + /* Look for a matching tablespace and if found free it. */ + do { + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_name(name); + + if (space != 0) { + ib_logf(IB_LOG_LEVEL_WARN, + "Tablespace '%s' exists in the cache " + "with id %lu != %lu", + name, (ulong) space->id, (ulong) id); + + if (id == 0 || purpose != FIL_TABLESPACE) { + + mutex_exit(&fil_system->mutex); + + return(FALSE); + } + + ib_logf(IB_LOG_LEVEL_WARN, + "Freeing existing tablespace '%s' entry " + "from the cache with id %lu", + name, (ulong) id); + + ibool success = fil_space_free(space->id, FALSE); + ut_a(success); + + mutex_exit(&fil_system->mutex); + } + + } while (space != 0); + + space = fil_space_get_by_id(id); + + if (space != 0) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Trying to add tablespace '%s' with id %lu " + "to the tablespace memory cache, but tablespace '%s' " + "with id %lu already exists in the cache!", + name, (ulong) id, space->name, (ulong) space->id); + + mutex_exit(&fil_system->mutex); + + return(FALSE); + } + + space = static_cast(mem_zalloc(sizeof(*space))); + + space->name = mem_strdup(name); + space->id = id; + + fil_system->tablespace_version++; + space->tablespace_version = fil_system->tablespace_version; + space->mark = FALSE; + + if (purpose == FIL_TABLESPACE && !recv_recovery_on + && id > fil_system->max_assigned_id) { + + if (!fil_system->space_id_reuse_warned) { + fil_system->space_id_reuse_warned = TRUE; + + ib_logf(IB_LOG_LEVEL_WARN, + "Allocated tablespace %lu, old maximum " + "was %lu", + (ulong) id, + (ulong) fil_system->max_assigned_id); + } + + fil_system->max_assigned_id = 
id; + } + + space->purpose = purpose; + space->flags = flags; + + space->magic_n = FIL_SPACE_MAGIC_N; + + rw_lock_create(fil_space_latch_key, &space->latch, SYNC_FSP); + + HASH_INSERT(fil_space_t, hash, fil_system->spaces, id, space); + + HASH_INSERT(fil_space_t, name_hash, fil_system->name_hash, + ut_fold_string(name), space); + space->is_in_unflushed_spaces = false; + + space->is_corrupt = FALSE; + + UT_LIST_ADD_LAST(space_list, fil_system->space_list, space); + + mutex_exit(&fil_system->mutex); + + return(TRUE); +} + +/*******************************************************************//** +Assigns a new space id for a new single-table tablespace. This works simply by +incrementing the global counter. If 4 billion id's is not enough, we may need +to recycle id's. +@return TRUE if assigned, FALSE if not */ +UNIV_INTERN +ibool +fil_assign_new_space_id( +/*====================*/ + ulint* space_id) /*!< in/out: space id */ +{ + ulint id; + ibool success; + + mutex_enter(&fil_system->mutex); + + id = *space_id; + + if (id < fil_system->max_assigned_id) { + id = fil_system->max_assigned_id; + } + + id++; + + if (id > (SRV_LOG_SPACE_FIRST_ID / 2) && (id % 1000000UL == 0)) { + ut_print_timestamp(stderr); + fprintf(stderr, + "InnoDB: Warning: you are running out of new" + " single-table tablespace id's.\n" + "InnoDB: Current counter is %lu and it" + " must not exceed %lu!\n" + "InnoDB: To reset the counter to zero" + " you have to dump all your tables and\n" + "InnoDB: recreate the whole InnoDB installation.\n", + (ulong) id, + (ulong) SRV_LOG_SPACE_FIRST_ID); + } + + success = (id < SRV_LOG_SPACE_FIRST_ID); + + if (success) { + *space_id = fil_system->max_assigned_id = id; + } else { + ut_print_timestamp(stderr); + fprintf(stderr, + "InnoDB: You have run out of single-table" + " tablespace id's!\n" + "InnoDB: Current counter is %lu.\n" + "InnoDB: To reset the counter to zero you" + " have to dump all your tables and\n" + "InnoDB: recreate the whole InnoDB 
installation.\n", + (ulong) id); + *space_id = ULINT_UNDEFINED; + } + + mutex_exit(&fil_system->mutex); + + return(success); +} + +/*******************************************************************//** +Frees a space object from the tablespace memory cache. Closes the files in +the chain but does not delete them. There must not be any pending i/o's or +flushes on the files. +@return TRUE if success */ +static +ibool +fil_space_free( +/*===========*/ + /* out: TRUE if success */ + ulint id, /* in: space id */ + ibool x_latched) /* in: TRUE if caller has space->latch + in X mode */ +{ + fil_space_t* space; + fil_space_t* fnamespace; + + ut_ad(mutex_own(&fil_system->mutex)); + + space = fil_space_get_by_id(id); + + if (!space) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: trying to remove tablespace %lu" + " from the cache but\n" + "InnoDB: it is not there.\n", (ulong) id); + + return(FALSE); + } + + HASH_DELETE(fil_space_t, hash, fil_system->spaces, id, space); + + fnamespace = fil_space_get_by_name(space->name); + ut_a(fnamespace); + ut_a(space == fnamespace); + + HASH_DELETE(fil_space_t, name_hash, fil_system->name_hash, + ut_fold_string(space->name), space); + + if (space->is_in_unflushed_spaces) { + + ut_ad(!fil_buffering_disabled(space)); + space->is_in_unflushed_spaces = false; + + UT_LIST_REMOVE(unflushed_spaces, fil_system->unflushed_spaces, + space); + } + + UT_LIST_REMOVE(space_list, fil_system->space_list, space); + + ut_a(space->magic_n == FIL_SPACE_MAGIC_N); + ut_a(0 == space->n_pending_flushes); + + for (fil_node_t* fil_node = UT_LIST_GET_FIRST(space->chain); + fil_node != NULL; + fil_node = UT_LIST_GET_FIRST(space->chain)) { + + fil_node_free(fil_node, fil_system, space); + } + + ut_a(0 == UT_LIST_GET_LEN(space->chain)); + + if (x_latched) { + rw_lock_x_unlock(&space->latch); + } + + rw_lock_free(&(space->latch)); + + mem_free(space->name); + mem_free(space); + + return(TRUE); +} + 
+/*******************************************************************//** +Returns a pointer to the file_space_t that is in the memory cache +associated with a space id. The caller must lock fil_system->mutex. +@return file_space_t pointer, NULL if space not found */ +UNIV_INLINE +fil_space_t* +fil_space_get_space( +/*================*/ + ulint id) /*!< in: space id */ +{ + fil_space_t* space; + fil_node_t* node; + + ut_ad(fil_system); + + space = fil_space_get_by_id(id); + if (space == NULL) { + return(NULL); + } + + if (space->size == 0 && space->purpose == FIL_TABLESPACE) { + ut_a(id != 0); + + mutex_exit(&fil_system->mutex); + + /* It is possible that the space gets evicted at this point + before the fil_mutex_enter_and_prepare_for_io() acquires + the fil_system->mutex. Check for this after completing the + call to fil_mutex_enter_and_prepare_for_io(). */ + fil_mutex_enter_and_prepare_for_io(id); + + /* We are still holding the fil_system->mutex. Check if + the space is still in memory cache. */ + space = fil_space_get_by_id(id); + if (space == NULL) { + return(NULL); + } + + /* The following code must change when InnoDB supports + multiple datafiles per tablespace. */ + ut_a(1 == UT_LIST_GET_LEN(space->chain)); + + node = UT_LIST_GET_FIRST(space->chain); + + /* It must be a single-table tablespace and we have not opened + the file yet; the following calls will open it and update the + size fields */ + + if (!fil_node_prepare_for_io(node, fil_system, space)) { + /* The single-table tablespace can't be opened, + because the ibd file is missing. */ + return(NULL); + } + fil_node_complete_io(node, fil_system, OS_FILE_READ); + } + + return(space); +} + +/*******************************************************************//** +Returns the path from the first fil_node_t found for the space ID sent. +The caller is responsible for freeing the memory allocated here for the +value returned. +@return own: A copy of fil_node_t::path, NULL if space ID is zero +or not found. 
*/ +UNIV_INTERN +char* +fil_space_get_first_path( +/*=====================*/ + ulint id) /*!< in: space id */ +{ + fil_space_t* space; + fil_node_t* node; + char* path; + + ut_ad(fil_system); + ut_a(id); + + fil_mutex_enter_and_prepare_for_io(id); + + space = fil_space_get_space(id); + + if (space == NULL) { + mutex_exit(&fil_system->mutex); + + return(NULL); + } + + ut_ad(mutex_own(&fil_system->mutex)); + + node = UT_LIST_GET_FIRST(space->chain); + + path = mem_strdup(node->name); + + mutex_exit(&fil_system->mutex); + + return(path); +} + +/*******************************************************************//** +Returns the size of the space in pages. The tablespace must be cached in the +memory cache. +@return space size, 0 if space not found */ +UNIV_INTERN +ulint +fil_space_get_size( +/*===============*/ + ulint id) /*!< in: space id */ +{ + fil_space_t* space; + ulint size; + + ut_ad(fil_system); + mutex_enter(&fil_system->mutex); + + space = fil_space_get_space(id); + + size = space ? space->size : 0; + + mutex_exit(&fil_system->mutex); + + return(size); +} + +/*******************************************************************//** +Returns the flags of the space. The tablespace must be cached +in the memory cache. +@return flags, ULINT_UNDEFINED if space not found */ +UNIV_INTERN +ulint +fil_space_get_flags( +/*================*/ + ulint id) /*!< in: space id */ +{ + fil_space_t* space; + ulint flags; + + ut_ad(fil_system); + + if (!id) { + return(0); + } + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_space(id); + + if (space == NULL) { + mutex_exit(&fil_system->mutex); + + return(ULINT_UNDEFINED); + } + + flags = space->flags; + + mutex_exit(&fil_system->mutex); + + return(flags); +} + +/*******************************************************************//** +Returns the compressed page size of the space, or 0 if the space +is not compressed. The tablespace must be cached in the memory cache. 
+@return compressed page size, ULINT_UNDEFINED if space not found */ +UNIV_INTERN +ulint +fil_space_get_zip_size( +/*===================*/ + ulint id) /*!< in: space id */ +{ + ulint flags; + + flags = fil_space_get_flags(id); + + if (flags && flags != ULINT_UNDEFINED) { + + return(fsp_flags_get_zip_size(flags)); + } + + return(flags); +} + +/*******************************************************************//** +Checks if the pair space, page_no refers to an existing page in a tablespace +file space. The tablespace must be cached in the memory cache. +@return TRUE if the address is meaningful */ +UNIV_INTERN +ibool +fil_check_adress_in_tablespace( +/*===========================*/ + ulint id, /*!< in: space id */ + ulint page_no)/*!< in: page number */ +{ + if (fil_space_get_size(id) > page_no) { + + return(TRUE); + } + + return(FALSE); +} + +/****************************************************************//** +Initializes the tablespace memory cache. */ +UNIV_INTERN +void +fil_init( +/*=====*/ + ulint hash_size, /*!< in: hash table size */ + ulint max_n_open) /*!< in: max number of open files */ +{ + ut_a(fil_system == NULL); + + ut_a(hash_size > 0); + ut_a(max_n_open > 0); + + fil_system = static_cast( + mem_zalloc(sizeof(fil_system_t))); + + mutex_create(fil_system_mutex_key, + &fil_system->mutex, SYNC_ANY_LATCH); + + fil_system->spaces = hash_create(hash_size); + fil_system->name_hash = hash_create(hash_size); + + UT_LIST_INIT(fil_system->LRU); + + fil_system->max_n_open = max_n_open; +} + +/*******************************************************************//** +Opens all log files and system tablespace data files. They stay open until the +database server shutdown. This should be called at a server startup after the +space objects for the log and the system tablespace have been created. The +purpose of this operation is to make sure we never run out of file descriptors +if we need to read from the insert buffer or to write to the log. 
*/
UNIV_INTERN
void
fil_open_log_and_system_tablespace_files(void)
/*==========================================*/
{
	fil_space_t*	cur_space;

	mutex_enter(&fil_system->mutex);

	cur_space = UT_LIST_GET_FIRST(fil_system->space_list);

	for (; cur_space != NULL;
	     cur_space = UT_LIST_GET_NEXT(space_list, cur_space)) {

		fil_node_t*	cur_node;

		/* Spaces that live in the LRU are opened on demand, not
		here. */
		if (fil_space_belongs_in_lru(cur_space)) {

			continue;
		}

		cur_node = UT_LIST_GET_FIRST(cur_space->chain);

		for (; cur_node != NULL;
		     cur_node = UT_LIST_GET_NEXT(chain, cur_node)) {

			if (!cur_node->open
			    && !fil_node_open_file(cur_node, fil_system,
						   cur_space)) {
				/* This func is called during server's
				startup. If some file of log or system
				tablespace is missing, the server
				can't start successfully. So we should
				assert for it. */
				ut_a(0);
			}

			/* Warn when fewer than 10 descriptors would be
			left for other files. */
			if (fil_system->max_n_open < 10 + fil_system->n_open) {

				fprintf(stderr,
					"InnoDB: Warning: you must"
					" raise the value of"
					" innodb_open_files in\n"
					"InnoDB: my.cnf! Remember that"
					" InnoDB keeps all log files"
					" and all system\n"
					"InnoDB: tablespace files open"
					" for the whole time mysqld is"
					" running, and\n"
					"InnoDB: needs to open also"
					" some .ibd files if the"
					" file-per-table storage\n"
					"InnoDB: model is used."
					" Current open files %lu,"
					" max allowed"
					" open files %lu.\n",
					(ulong) fil_system->n_open,
					(ulong) fil_system->max_n_open);
			}
		}
	}

	mutex_exit(&fil_system->mutex);
}

/*******************************************************************//**
Closes all open files. There must not be any pending i/o's or not flushed
modifications in the files.
*/ +UNIV_INTERN +void +fil_close_all_files(void) +/*=====================*/ +{ + fil_space_t* space; + + mutex_enter(&fil_system->mutex); + + space = UT_LIST_GET_FIRST(fil_system->space_list); + + while (space != NULL) { + fil_node_t* node; + fil_space_t* prev_space = space; + + for (node = UT_LIST_GET_FIRST(space->chain); + node != NULL; + node = UT_LIST_GET_NEXT(chain, node)) { + + if (node->open) { + fil_node_close_file(node, fil_system); + } + } + + space = UT_LIST_GET_NEXT(space_list, space); + + fil_space_free(prev_space->id, FALSE); + } + + mutex_exit(&fil_system->mutex); +} + +/*******************************************************************//** +Closes the redo log files. There must not be any pending i/o's or not +flushed modifications in the files. */ +UNIV_INTERN +void +fil_close_log_files( +/*================*/ + bool free) /*!< in: whether to free the memory object */ +{ + fil_space_t* space; + + mutex_enter(&fil_system->mutex); + + space = UT_LIST_GET_FIRST(fil_system->space_list); + + while (space != NULL) { + fil_node_t* node; + fil_space_t* prev_space = space; + + if (space->purpose != FIL_LOG) { + space = UT_LIST_GET_NEXT(space_list, space); + continue; + } + + for (node = UT_LIST_GET_FIRST(space->chain); + node != NULL; + node = UT_LIST_GET_NEXT(chain, node)) { + + if (node->open) { + fil_node_close_file(node, fil_system); + } + } + + space = UT_LIST_GET_NEXT(space_list, space); + + if (free) { + fil_space_free(prev_space->id, FALSE); + } + } + + mutex_exit(&fil_system->mutex); +} + +/*******************************************************************//** +Sets the max tablespace id counter if the given number is bigger than the +previous value. 
*/ +UNIV_INTERN +void +fil_set_max_space_id_if_bigger( +/*===========================*/ + ulint max_id) /*!< in: maximum known id */ +{ + if (max_id >= SRV_LOG_SPACE_FIRST_ID) { + fprintf(stderr, + "InnoDB: Fatal error: max tablespace id" + " is too high, %lu\n", (ulong) max_id); + ut_error; + } + + mutex_enter(&fil_system->mutex); + + if (fil_system->max_assigned_id < max_id) { + + fil_system->max_assigned_id = max_id; + } + + mutex_exit(&fil_system->mutex); +} + +/****************************************************************//** +Writes the flushed lsn and the latest archived log number to the page header +of the first page of a data file of the system tablespace (space 0), +which is uncompressed. */ +static __attribute__((warn_unused_result)) +dberr_t +fil_write_lsn_and_arch_no_to_file( +/*==============================*/ + ulint space, /*!< in: space to write to */ + ulint sum_of_sizes, /*!< in: combined size of previous files + in space, in database pages */ + lsn_t lsn, /*!< in: lsn to write */ + ulint arch_log_no __attribute__((unused))) + /*!< in: archived log number to write */ +{ + byte* buf1; + byte* buf; + dberr_t err; + + buf1 = static_cast(mem_alloc(2 * UNIV_PAGE_SIZE)); + buf = static_cast(ut_align(buf1, UNIV_PAGE_SIZE)); + + err = fil_read(TRUE, space, 0, sum_of_sizes, 0, + UNIV_PAGE_SIZE, buf, NULL, 0); + if (err == DB_SUCCESS) { + mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn); + + err = fil_write(TRUE, space, 0, sum_of_sizes, 0, + UNIV_PAGE_SIZE, buf, NULL, 0); + } + + mem_free(buf1); + + return(err); +} + +/****************************************************************//** +Writes the flushed lsn and the latest archived log number to the page +header of the first page of each data file in the system tablespace. 
+@return DB_SUCCESS or error number */ +UNIV_INTERN +dberr_t +fil_write_flushed_lsn_to_data_files( +/*================================*/ + lsn_t lsn, /*!< in: lsn to write */ + ulint arch_log_no) /*!< in: latest archived log file number */ +{ + fil_space_t* space; + fil_node_t* node; + dberr_t err; + + mutex_enter(&fil_system->mutex); + + for (space = UT_LIST_GET_FIRST(fil_system->space_list); + space != NULL; + space = UT_LIST_GET_NEXT(space_list, space)) { + + /* We only write the lsn to all existing data files which have + been open during the lifetime of the mysqld process; they are + represented by the space objects in the tablespace memory + cache. Note that all data files in the system tablespace 0 + and the UNDO log tablespaces (if separate) are always open. */ + + if (space->purpose == FIL_TABLESPACE + && !fil_is_user_tablespace_id(space->id)) { + ulint sum_of_sizes = 0; + + for (node = UT_LIST_GET_FIRST(space->chain); + node != NULL; + node = UT_LIST_GET_NEXT(chain, node)) { + + mutex_exit(&fil_system->mutex); + + err = fil_write_lsn_and_arch_no_to_file( + space->id, sum_of_sizes, lsn, + arch_log_no); + + if (err != DB_SUCCESS) { + + return(err); + } + + mutex_enter(&fil_system->mutex); + + sum_of_sizes += node->size; + } + } + } + + mutex_exit(&fil_system->mutex); + + return(DB_SUCCESS); +} + +/*******************************************************************//** +Checks the consistency of the first data page of a tablespace +at database startup. 
+@retval NULL on success, or if innodb_force_recovery is set +@return pointer to an error message string */ +static __attribute__((warn_unused_result)) +const char* +fil_check_first_page( +/*=================*/ + const page_t* page) /*!< in: data page */ +{ + ulint space_id; + ulint flags; + + if (srv_force_recovery >= SRV_FORCE_IGNORE_CORRUPT) { + return(NULL); + } + + space_id = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_ID + page); + flags = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page); + + if (UNIV_PAGE_SIZE != fsp_flags_get_page_size(flags)) { + fprintf(stderr, "InnoDB: Error: Current page size %lu != page size on page %lu\n", + UNIV_PAGE_SIZE, fsp_flags_get_page_size(flags)); + + return("innodb-page-size mismatch"); + } + + if (!space_id && !flags) { + ulint nonzero_bytes = UNIV_PAGE_SIZE; + const byte* b = page; + + while (!*b && --nonzero_bytes) { + b++; + } + + if (!nonzero_bytes) { + return("space header page consists of zero bytes"); + } + } + + if (buf_page_is_corrupted( + false, page, fsp_flags_get_zip_size(flags))) { + return("checksum mismatch"); + } + + if (page_get_space_id(page) == space_id + && page_get_page_no(page) == 0) { + return(NULL); + } + + return("inconsistent data in space header"); +} + +/*******************************************************************//** +Reads the flushed lsn, arch no, and tablespace flag fields from a data +file at database startup. 
+@retval NULL on success, or if innodb_force_recovery is set +@return pointer to an error message string */ +UNIV_INTERN +const char* +fil_read_first_page( +/*================*/ + os_file_t data_file, /*!< in: open data file */ + ibool one_read_already, /*!< in: TRUE if min and max + parameters below already + contain sensible data */ + ulint* flags, /*!< out: tablespace flags */ + ulint* space_id, /*!< out: tablespace ID */ + lsn_t* min_flushed_lsn, /*!< out: min of flushed + lsn values in data files */ + lsn_t* max_flushed_lsn, /*!< out: max of flushed + lsn values in data files */ + ulint orig_space_id) /*!< in: original file space + id */ +{ + byte* buf; + byte* page; + lsn_t flushed_lsn; + const char* check_msg = NULL; + + buf = static_cast(ut_malloc(2 * UNIV_PAGE_SIZE)); + + /* Align the memory for a possible read from a raw device */ + + page = static_cast(ut_align(buf, UNIV_PAGE_SIZE)); + + os_file_read(data_file, page, 0, UNIV_PAGE_SIZE, + orig_space_id != ULINT_UNDEFINED ? + fil_space_is_page_compressed(orig_space_id) : + FALSE); + + *flags = fsp_header_get_flags(page); + + /* Page is page compressed page, need to decompress, before + continue. 
*/ + if (fsp_flags_is_page_compressed(*flags)) { + ulint write_size=0; + fil_decompress_page(NULL, page, UNIV_PAGE_SIZE, &write_size); + } + + *space_id = fsp_header_get_space_id(page); + + flushed_lsn = mach_read_from_8(page + FIL_PAGE_FILE_FLUSH_LSN); + + if (!one_read_already) { + check_msg = fil_check_first_page(page); + } + + ut_free(buf); + + if (check_msg) { + return(check_msg); + } + + if (!one_read_already) { + *min_flushed_lsn = flushed_lsn; + *max_flushed_lsn = flushed_lsn; + + return(NULL); + } + + if (*min_flushed_lsn > flushed_lsn) { + *min_flushed_lsn = flushed_lsn; + } + if (*max_flushed_lsn < flushed_lsn) { + *max_flushed_lsn = flushed_lsn; + } + + return(NULL); +} + +/*================ SINGLE-TABLE TABLESPACES ==========================*/ + +#ifndef UNIV_HOTBACKUP +/*******************************************************************//** +Increments the count of pending operation, if space is not being deleted. +@return TRUE if being deleted, and operation should be skipped */ +UNIV_INTERN +ibool +fil_inc_pending_ops( +/*================*/ + ulint id) /*!< in: space id */ +{ + fil_space_t* space; + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(id); + + if (space == NULL) { + fprintf(stderr, + "InnoDB: Error: trying to do an operation on a" + " dropped tablespace %lu\n", + (ulong) id); + } + + if (space == NULL || space->stop_new_ops) { + mutex_exit(&fil_system->mutex); + + return(TRUE); + } + + space->n_pending_ops++; + + mutex_exit(&fil_system->mutex); + + return(FALSE); +} + +/*******************************************************************//** +Decrements the count of pending operations. 
*/ +UNIV_INTERN +void +fil_decr_pending_ops( +/*=================*/ + ulint id) /*!< in: space id */ +{ + fil_space_t* space; + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(id); + + if (space == NULL) { + fprintf(stderr, + "InnoDB: Error: decrementing pending operation" + " of a dropped tablespace %lu\n", + (ulong) id); + } + + if (space != NULL) { + space->n_pending_ops--; + } + + mutex_exit(&fil_system->mutex); +} +#endif /* !UNIV_HOTBACKUP */ + +/********************************************************//** +Creates the database directory for a table if it does not exist yet. */ +static +void +fil_create_directory_for_tablename( +/*===============================*/ + const char* name) /*!< in: name in the standard + 'databasename/tablename' format */ +{ + const char* namend; + char* path; + ulint len; + + len = strlen(fil_path_to_mysql_datadir); + namend = strchr(name, '/'); + ut_a(namend); + path = static_cast(mem_alloc(len + (namend - name) + 2)); + + memcpy(path, fil_path_to_mysql_datadir, len); + path[len] = '/'; + memcpy(path + len + 1, name, namend - name); + path[len + (namend - name) + 1] = 0; + + srv_normalize_path_for_win(path); + + ut_a(os_file_create_directory(path, FALSE)); + mem_free(path); +} + +#ifndef UNIV_HOTBACKUP +/********************************************************//** +Writes a log record about an .ibd file create/rename/delete. 
*/ +static +void +fil_op_write_log( +/*=============*/ + ulint type, /*!< in: MLOG_FILE_CREATE, + MLOG_FILE_CREATE2, + MLOG_FILE_DELETE, or + MLOG_FILE_RENAME */ + ulint space_id, /*!< in: space id */ + ulint log_flags, /*!< in: redo log flags (stored + in the page number field) */ + ulint flags, /*!< in: compressed page size + and file format + if type==MLOG_FILE_CREATE2, or 0 */ + const char* name, /*!< in: table name in the familiar + 'databasename/tablename' format, or + the file path in the case of + MLOG_FILE_DELETE */ + const char* new_name, /*!< in: if type is MLOG_FILE_RENAME, + the new table name in the + 'databasename/tablename' format */ + mtr_t* mtr) /*!< in: mini-transaction handle */ +{ + byte* log_ptr; + ulint len; + + log_ptr = mlog_open(mtr, 11 + 2 + 1); + + if (!log_ptr) { + /* Logging in mtr is switched off during crash recovery: + in that case mlog_open returns NULL */ + return; + } + + log_ptr = mlog_write_initial_log_record_for_file_op( + type, space_id, log_flags, log_ptr, mtr); + if (type == MLOG_FILE_CREATE2) { + mach_write_to_4(log_ptr, flags); + log_ptr += 4; + } + /* Let us store the strings as null-terminated for easier readability + and handling */ + + len = strlen(name) + 1; + + mach_write_to_2(log_ptr, len); + log_ptr += 2; + mlog_close(mtr, log_ptr); + + mlog_catenate_string(mtr, (byte*) name, len); + + if (type == MLOG_FILE_RENAME) { + len = strlen(new_name) + 1; + log_ptr = mlog_open(mtr, 2 + len); + ut_a(log_ptr); + mach_write_to_2(log_ptr, len); + log_ptr += 2; + mlog_close(mtr, log_ptr); + + mlog_catenate_string(mtr, (byte*) new_name, len); + } +} +#endif + +/*******************************************************************//** +Parses the body of a log record written about an .ibd file operation. That is, +the log record part after the standard (type, space id, page no) header of the +log record. + +If desired, also replays the delete or rename operation if the .ibd file +exists and the space id in it matches. 
Replays the create operation if a file
+at that path does not exist yet. If the database directory for the file to be
+created does not exist, then we create the directory, too.
+
+Note that ibbackup --apply-log sets fil_path_to_mysql_datadir to point to the
+datadir that we should use in replaying the file operations.
+
+InnoDB recovery does not replay these fully since it always sets the space id
+to zero. But ibbackup does replay them. TODO: If remote tablespaces are used,
+ibbackup will only create tables in the default directory since MLOG_FILE_CREATE
+and MLOG_FILE_CREATE2 only know the tablename, not the path.
+
+@return end of log record, or NULL if the record was not completely
+contained between ptr and end_ptr */
+UNIV_INTERN
+byte*
+fil_op_log_parse_or_replay(
+/*=======================*/
+	byte*	ptr,		/*!< in: buffer containing the log record body,
+				or an initial segment of it, if the record does
+				not fit completely between ptr and end_ptr */
+	byte*	end_ptr,	/*!< in: buffer end */
+	ulint	type,		/*!< in: the type of this log record */
+	ulint	space_id,	/*!< in: the space id of the tablespace in
+				question, or 0 if the log record should
+				only be parsed but not replayed */
+	ulint	log_flags)	/*!< in: redo log flags
+				(stored in the page number parameter) */
+{
+	ulint		name_len;
+	ulint		new_name_len;
+	const char*	name;
+	const char*	new_name	= NULL;
+	ulint		flags		= 0;
+
+	if (type == MLOG_FILE_CREATE2) {
+		if (end_ptr < ptr + 4) {
+
+			return(NULL);
+		}
+
+		flags = mach_read_from_4(ptr);
+		ptr += 4;
+	}
+
+	if (end_ptr < ptr + 2) {
+
+		return(NULL);
+	}
+
+	name_len = mach_read_from_2(ptr);
+
+	ptr += 2;
+
+	if (end_ptr < ptr + name_len) {
+
+		return(NULL);
+	}
+
+	name = (const char*) ptr;
+
+	ptr += name_len;
+
+	if (type == MLOG_FILE_RENAME) {
+		if (end_ptr < ptr + 2) {
+
+			return(NULL);
+		}
+
+		new_name_len = mach_read_from_2(ptr);
+
+		ptr += 2;
+
+		if (end_ptr < ptr + new_name_len) {
+
+			return(NULL);
+		}
+
+		new_name = (const char*) ptr;
+
+		ptr += new_name_len;
+	}
+
+	/* We managed to parse a full log record body: a 4-byte flags field
+	(MLOG_FILE_CREATE2 only), a 2-byte length followed by the name, and
+	for MLOG_FILE_RENAME a second 2-byte length followed by the new
+	name. Every step above returned NULL if the buffer ended early. */
+	/*
+	printf("Parsed log rec of type %lu space %lu\n"
+	"name %s\n", type, space_id, name);
+
+	if (type == MLOG_FILE_RENAME) {
+	printf("new name %s\n", new_name);
+	}
+	*/
+	if (!space_id) {
+		return(ptr);
+	} else {
+		/* Only replay file ops during recovery. This is a
+		release-build assert to minimize any data loss risk by a
+		misapplied file operation. */
+		ut_a(recv_recovery_is_on());
+	}
+
+	/* Let us try to perform the file operation, if sensible. Note that
+	ibbackup has at this stage already read in all space id info to the
+	fil0fil.cc data structures.
+
+	NOTE that our algorithm is not guaranteed to work correctly if there
+	were renames of tables during the backup. See ibbackup code for more
+	on the problem. */
+
+	switch (type) {
+	case MLOG_FILE_DELETE:
+		if (fil_tablespace_exists_in_mem(space_id)) {
+			dberr_t	err = fil_delete_tablespace(
+				space_id, BUF_REMOVE_FLUSH_NO_WRITE);
+			ut_a(err == DB_SUCCESS);
+		}
+
+		break;
+
+	case MLOG_FILE_RENAME:
+		/* In order to replay the rename, the following must hold:
+		* The new name is not already used.
+		* A tablespace is open in memory with the old name.
+		* The space ID for that tablespace matches this log entry.
+		This will prevent unintended renames during recovery. */
+
+		if (fil_get_space_id_for_table(new_name) == ULINT_UNDEFINED
+		    && space_id == fil_get_space_id_for_table(name)) {
+			/* Create the database directory for the new name, if
+			it does not exist yet */
+			fil_create_directory_for_tablename(new_name);
+
+			if (!fil_rename_tablespace(name, space_id,
+						   new_name, NULL)) {
+				ut_error;
+			}
+		}
+
+		break;
+
+	case MLOG_FILE_CREATE:
+	case MLOG_FILE_CREATE2:
+		if (fil_tablespace_exists_in_mem(space_id)) {
+			/* Do nothing */
+		} else if (fil_get_space_id_for_table(name)
+			   != ULINT_UNDEFINED) {
+			/* Do nothing */
+		} else if (log_flags & MLOG_FILE_FLAG_TEMP) {
+			/* Temporary table, do nothing */
+		} else {
+			const char*	path = NULL;
+
+			/* Create the database directory for name, if it does
+			not exist yet */
+			fil_create_directory_for_tablename(name);
+
+			if (fil_create_new_single_table_tablespace(
+				    space_id, name, path, flags,
+				    DICT_TF2_USE_TABLESPACE,
+				    FIL_IBD_FILE_INITIAL_SIZE) != DB_SUCCESS) {
+				ut_error;
+			}
+		}
+
+		break;
+
+	default:
+		ut_error;
+	}
+
+	return(ptr);
+}
+
+/*******************************************************************//**
+Allocates a file name for the EXPORT/IMPORT config file name. The
+string must be freed by caller with mem_free().
+@return own: file name */
+static
+char*
+fil_make_cfg_name(
+/*==============*/
+	const char*	filepath)	/*!< in: .ibd file name */
+{
+	char*	cfg_name;
+
+	/* Create a temporary file path by replacing the .ibd suffix
+	with .cfg. The last three characters of a copy of filepath are
+	overwritten in place, so the result has the same length. */
+
+	ut_ad(strlen(filepath) > 4);
+
+	cfg_name = mem_strdup(filepath);
+	ut_snprintf(cfg_name + strlen(cfg_name) - 3, 4, "cfg");
+	return(cfg_name);
+}
+
+/*******************************************************************//**
+Check for change buffer merges.
+@return 0 if no merges else count + 1.
*/
+static
+ulint
+fil_ibuf_check_pending_ops(
+/*=======================*/
+	fil_space_t*	space,	/*!< in/out: Tablespace to check */
+	ulint		count)	/*!< in: number of attempts so far */
+{
+	ut_ad(mutex_own(&fil_system->mutex));
+
+	/* Nothing to wait for: no such space, or nothing pending on it. */
+	if (space == 0 || space->n_pending_ops == 0) {
+		return(0);
+	}
+
+	/* Only start warning once the caller has retried many times. */
+	if (count > 5000) {
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"Trying to close/delete tablespace "
+			"'%s' but there are %lu pending change "
+			"buffer merges on it.",
+			space->name,
+			(ulong) space->n_pending_ops);
+	}
+
+	return(count + 1);
+}
+
+/*******************************************************************//**
+Check for pending IO.
+@return 0 if no pending else count + 1. */
+static
+ulint
+fil_check_pending_io(
+/*=================*/
+	fil_space_t*	space,	/*!< in/out: Tablespace to check */
+	fil_node_t**	node,	/*!< out: Node in space list */
+	ulint		count)	/*!< in: number of attempts so far */
+{
+	ut_ad(mutex_own(&fil_system->mutex));
+	ut_a(space->n_pending_ops == 0);
+
+	/* The following code must change when InnoDB supports
+	multiple datafiles per tablespace. */
+	ut_a(UT_LIST_GET_LEN(space->chain) == 1);
+
+	fil_node_t*	first = UT_LIST_GET_FIRST(space->chain);
+
+	/* The node is handed back to the caller in every case. */
+	*node = first;
+
+	if (!(space->n_pending_flushes > 0 || first->n_pending > 0)) {
+		return(0);
+	}
+
+	ut_a(!first->being_extended);
+
+	/* Only start warning once the caller has retried many times. */
+	if (count > 1000) {
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"Trying to close/delete tablespace '%s' "
+			"but there are %lu flushes "
+			" and %lu pending i/o's on it.",
+			space->name,
+			(ulong) space->n_pending_flushes,
+			(ulong) first->n_pending);
+	}
+
+	return(count + 1);
+}
+
+/*******************************************************************//**
+Check pending operations on a tablespace.
+@return DB_SUCCESS or error failure.
*/
+static
+dberr_t
+fil_check_pending_operations(
+/*=========================*/
+	ulint		id,	/*!< in: space id */
+	fil_space_t**	space,	/*!< out: tablespace instance in memory;
+				left NULL unless DB_SUCCESS is returned */
+	char**		path)	/*!< out/own: tablespace path: a
+				mem_strdup() copy of the node name on
+				DB_SUCCESS, NULL on
+				DB_TABLESPACE_NOT_FOUND */
+{
+	ulint		count = 0;
+
+	ut_a(id != TRX_SYS_SPACE);
+	ut_ad(space);
+
+	*space = 0;
+
+	mutex_enter(&fil_system->mutex);
+	fil_space_t* sp = fil_space_get_by_id(id);
+	if (sp) {
+		sp->stop_new_ops = TRUE;
+	}
+	mutex_exit(&fil_system->mutex);
+
+	/* Check for pending change buffer merges. The space is looked up
+	again on every attempt because the mutex is released while
+	sleeping; count is the retry counter maintained by
+	fil_ibuf_check_pending_ops() and becomes 0 when nothing is
+	pending (or the space no longer exists). */
+
+	do {
+		mutex_enter(&fil_system->mutex);
+
+		sp = fil_space_get_by_id(id);
+
+		count = fil_ibuf_check_pending_ops(sp, count);
+
+		mutex_exit(&fil_system->mutex);
+
+		if (count > 0) {
+			os_thread_sleep(20000);
+		}
+
+	} while (count > 0);
+
+	/* Check for pending IO. Same polling scheme as above, except that
+	a space that has disappeared ends the wait with
+	DB_TABLESPACE_NOT_FOUND. The node name is copied while the mutex
+	is still held. */
+
+	*path = 0;
+
+	do {
+		mutex_enter(&fil_system->mutex);
+
+		sp = fil_space_get_by_id(id);
+
+		if (sp == NULL) {
+			mutex_exit(&fil_system->mutex);
+			return(DB_TABLESPACE_NOT_FOUND);
+		}
+
+		fil_node_t*	node;
+
+		count = fil_check_pending_io(sp, &node, count);
+
+		if (count == 0) {
+			*path = mem_strdup(node->name);
+		}
+
+		mutex_exit(&fil_system->mutex);
+
+		if (count > 0) {
+			os_thread_sleep(20000);
+		}
+
+	} while (count > 0);
+
+	ut_ad(sp);
+
+	*space = sp;
+	return(DB_SUCCESS);
+}
+
+/*******************************************************************//**
+Closes a single-table tablespace. The tablespace must be cached in the
+memory cache. Free all pages used by the tablespace.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+dberr_t
+fil_close_tablespace(
+/*=================*/
+	trx_t*		trx,	/*!< in/out: Transaction covering the close;
+				passed on to
+				buf_LRU_flush_or_remove_pages() */
+	ulint		id)	/*!< in: space id; must not be
+				TRX_SYS_SPACE */
+{
+	char*		path = 0;
+	fil_space_t*	space = 0;
+
+	ut_a(id != TRX_SYS_SPACE);
+
+	dberr_t		err = fil_check_pending_operations(id, &space, &path);
+
+	if (err != DB_SUCCESS) {
+		return(err);
+	}
+
+	ut_a(space);
+	ut_a(path != 0);
+
+	rw_lock_x_lock(&space->latch);
+
+#ifndef UNIV_HOTBACKUP
+	/* Invalidate in the buffer pool all pages belonging to the
+	tablespace. Since we have set space->stop_new_ops = TRUE, readahead
+	or ibuf merge can no longer read more pages of this tablespace to the
+	buffer pool. Thus we can clean the tablespace out of the buffer pool
+	completely and permanently. The flag stop_new_ops also prevents
+	fil_flush() from being applied to this tablespace. */
+
+	buf_LRU_flush_or_remove_pages(id, BUF_REMOVE_FLUSH_WRITE, trx);
+#endif
+	mutex_enter(&fil_system->mutex);
+
+	/* If the free is successful, the X lock will be released before
+	the space memory data structure is freed. If it fails, we still
+	hold the X lock and release it here ourselves. */
+
+	if (!fil_space_free(id, TRUE)) {
+		rw_lock_x_unlock(&space->latch);
+		err = DB_TABLESPACE_NOT_FOUND;
+	} else {
+		err = DB_SUCCESS;
+	}
+
+	mutex_exit(&fil_system->mutex);
+
+	/* If it is a delete then also delete any generated files, otherwise
+	when we drop the database the remove directory will fail. This is
+	done whether or not the free above succeeded. */
+
+	char*	cfg_name = fil_make_cfg_name(path);
+
+	os_file_delete_if_exists(innodb_file_data_key, cfg_name);
+
+	mem_free(path);
+	mem_free(cfg_name);
+
+	return(err);
+}
+
+/*******************************************************************//**
+Deletes a single-table tablespace. The tablespace must be cached in the
+memory cache.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+dberr_t
+fil_delete_tablespace(
+/*==================*/
+	ulint		id,		/*!< in: space id */
+	buf_remove_t	buf_remove)	/*!< in: specify the action to take
+					on the tables pages in the buffer
+					pool */
+{
+	char*		path = 0;
+	fil_space_t*	space = 0;
+
+	ut_a(id != TRX_SYS_SPACE);
+
+	dberr_t		err = fil_check_pending_operations(id, &space, &path);
+
+	if (err != DB_SUCCESS) {
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Cannot delete tablespace %lu because it is not "
+			"found in the tablespace memory cache.",
+			(ulong) id);
+
+		return(err);
+	}
+
+	ut_a(space);
+	ut_a(path != 0);
+
+	/* Important: We rely on the data dictionary mutex to ensure
+	that a race is not possible here. It should serialize the tablespace
+	drop/free. We acquire an X latch only to avoid a race condition
+	when accessing the tablespace instance via:
+
+	  fsp_get_available_space_in_free_extents().
+
+	There our main motivation is to reduce the contention on the
+	dictionary mutex. */
+
+	rw_lock_x_lock(&space->latch);
+
+#ifndef UNIV_HOTBACKUP
+	/* IMPORTANT: Because we have set space::stop_new_ops there
+	can't be any new ibuf merges, reads or flushes. We are here
+	because node::n_pending was zero above. However, it is still
+	possible to have pending read and write requests:
+
+	A read request can happen because the reader thread has
+	gone through the ::stop_new_ops check in buf_page_init_for_read()
+	before the flag was set and has not yet incremented ::n_pending
+	when we checked it above.
+
+	A write request can be issued any time because we don't check
+	the ::stop_new_ops flag when queueing a block for write.
+
+	We deal with pending write requests in the following function
+	where we'd minimally evict all dirty pages belonging to this
+	space from the flush_list. Note that if a block is IO-fixed
+	we'll wait for IO to complete.
+
+	We deal with potential read requests by checking the
+	::stop_new_ops flag in fil_io() */
+
+	buf_LRU_flush_or_remove_pages(id, buf_remove, 0);
+
+#endif /* !UNIV_HOTBACKUP */
+
+	/* If it is a delete then also delete any generated files, otherwise
+	when we drop the database the remove directory will fail. */
+	{
+		char*	cfg_name = fil_make_cfg_name(path);
+		os_file_delete_if_exists(innodb_file_data_key, cfg_name);
+		mem_free(cfg_name);
+	}
+
+	/* Delete the link file pointing to the ibd file we are deleting. */
+	if (FSP_FLAGS_HAS_DATA_DIR(space->flags)) {
+		fil_delete_link_file(space->name);
+	}
+
+	mutex_enter(&fil_system->mutex);
+
+	/* Double check the sanity of pending ops after reacquiring
+	the fil_system::mutex. */
+	if (fil_space_get_by_id(id)) {
+		ut_a(space->n_pending_ops == 0);
+		ut_a(UT_LIST_GET_LEN(space->chain) == 1);
+		fil_node_t* node = UT_LIST_GET_FIRST(space->chain);
+		ut_a(node->n_pending == 0);
+	}
+
+	if (!fil_space_free(id, TRUE)) {
+		err = DB_TABLESPACE_NOT_FOUND;
+	}
+
+	mutex_exit(&fil_system->mutex);
+
+	if (err != DB_SUCCESS) {
+		rw_lock_x_unlock(&space->latch);
+	} else if (!os_file_delete(innodb_file_data_key, path)
+		   && !os_file_delete_if_exists(innodb_file_data_key, path)) {
+
+		/* Note: This is because we have removed the
+		tablespace instance from the cache. */
+
+		err = DB_IO_ERROR;
+	}
+
+	if (err == DB_SUCCESS) {
+#ifndef UNIV_HOTBACKUP
+		/* Write a log record about the deletion of the .ibd
+		file, so that ibbackup can replay it in the
+		--apply-log phase. We use a dummy mtr and the familiar
+		log write mechanism. */
+		mtr_t		mtr;
+
+		/* When replaying the operation in ibbackup, do not try
+		to write any log record */
+		mtr_start(&mtr);
+
+		fil_op_write_log(MLOG_FILE_DELETE, id, 0, 0, path, NULL, &mtr);
+		mtr_commit(&mtr);
+#endif
+		err = DB_SUCCESS;
+	}
+
+	mem_free(path);
+
+	return(err);
+}
+
+/*******************************************************************//**
+Returns TRUE if a single-table tablespace is being deleted.
+@return TRUE if being deleted */
+UNIV_INTERN
+ibool
+fil_tablespace_is_being_deleted(
+/*============================*/
+	ulint		id)	/*!< in: space id; the space must exist in
+				the memory cache (asserted below) */
+{
+	fil_space_t*	space;
+	ibool		is_being_deleted;
+
+	mutex_enter(&fil_system->mutex);
+
+	space = fil_space_get_by_id(id);
+
+	ut_a(space != NULL);
+
+	is_being_deleted = space->stop_new_ops;
+
+	mutex_exit(&fil_system->mutex);
+
+	return(is_being_deleted);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Discards a single-table tablespace. The tablespace must be cached in the
+memory cache. Discarding is like deleting a tablespace, but
+
+ 1. We do not drop the table from the data dictionary;
+
+ 2. We remove all insert buffer entries for the tablespace immediately;
+ in DROP TABLE they are only removed gradually in the background;
+
+ 3. Free all the pages in use by the tablespace.
+@return DB_SUCCESS or error */
+UNIV_INTERN
+dberr_t
+fil_discard_tablespace(
+/*===================*/
+	ulint	id)	/*!< in: space id */
+{
+	dberr_t	err;
+
+	switch (err = fil_delete_tablespace(id, BUF_REMOVE_ALL_NO_WRITE)) {
+	case DB_SUCCESS:
+		break;
+
+	case DB_IO_ERROR:
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"While deleting tablespace %lu in DISCARD TABLESPACE."
+			" File rename/delete failed: %s",
+			(ulong) id, ut_strerr(err));
+		break;
+
+	case DB_TABLESPACE_NOT_FOUND:
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"Cannot delete tablespace %lu in DISCARD "
+			"TABLESPACE. %s",
+			(ulong) id, ut_strerr(err));
+		break;
+
+	default:
+		ut_error;
+	}
+
+	/* Remove all insert buffer entries for the tablespace. This is
+	done for every error code that reaches this point, not only on
+	DB_SUCCESS. */
+
+	ibuf_delete_for_discarded_space(id);
+
+	return(err);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*******************************************************************//**
+Renames the memory cache structures of a single-table tablespace.
+@return TRUE if success */
+static
+ibool
+fil_rename_tablespace_in_mem(
+/*=========================*/
+	fil_space_t*	space,	/*!< in: tablespace memory object */
+	fil_node_t*	node,	/*!< in: file node of that tablespace */
+	const char*	new_name,	/*!< in: new name; copied with
+					mem_strdup() */
+	const char*	new_path)	/*!< in: new file path; copied with
+					mem_strdup() */
+{
+	fil_space_t*	space2;
+	const char*	old_name	= space->name;
+
+	ut_ad(mutex_own(&fil_system->mutex));
+
+	space2 = fil_space_get_by_name(old_name);
+	if (space != space2) {
+		fputs("InnoDB: Error: cannot find ", stderr);
+		ut_print_filename(stderr, old_name);
+		fputs(" in tablespace memory cache\n", stderr);
+
+		return(FALSE);
+	}
+
+	space2 = fil_space_get_by_name(new_name);
+	if (space2 != NULL) {
+		fputs("InnoDB: Error: ", stderr);
+		ut_print_filename(stderr, new_name);
+		fputs(" is already in tablespace memory cache\n", stderr);
+
+		return(FALSE);
+	}
+
+	HASH_DELETE(fil_space_t, name_hash, fil_system->name_hash,
+		    ut_fold_string(space->name), space);
+	mem_free(space->name);
+	mem_free(node->name);
+
+	space->name = mem_strdup(new_name);
+	node->name = mem_strdup(new_path);
+
+	HASH_INSERT(fil_space_t, name_hash, fil_system->name_hash,
+		    ut_fold_string(new_name), space);
+	return(TRUE);
+}
+
+/*******************************************************************//**
+Allocates a file name for a single-table tablespace. The string must be freed
+by caller with mem_free().
+@return own: file name */
+UNIV_INTERN
+char*
+fil_make_ibd_name(
+/*==============*/
+	const char*	name,		/*!< in: table name or a dir path */
+	bool		is_full_path)	/*!< in: TRUE if it is a dir path */
+{
+	char*	filename;
+	ulint	namelen		= strlen(name);
+	ulint	dirlen		= strlen(fil_path_to_mysql_datadir);
+	/* sizeof "/.ibd" counts the terminating NUL, so pathlen covers
+	datadir + '/' + name + ".ibd" + NUL. */
+	ulint	pathlen		= dirlen + namelen + sizeof "/.ibd";
+
+	filename = static_cast<char*>(mem_alloc(pathlen));
+
+	if (is_full_path) {
+		/* name already is the full path: just append the suffix
+		(sizeof ".ibd" copies its NUL as well). */
+		memcpy(filename, name, namelen);
+		memcpy(filename + namelen, ".ibd", sizeof ".ibd");
+	} else {
+		ut_snprintf(filename, pathlen, "%s/%s.ibd",
+			fil_path_to_mysql_datadir, name);
+
+	}
+
+	srv_normalize_path_for_win(filename);
+
+	return(filename);
+}
+
+/*******************************************************************//**
+Allocates a file name for a tablespace ISL file (InnoDB Symbolic Link).
+The string must be freed by caller with mem_free().
+@return own: file name */
+UNIV_INTERN
+char*
+fil_make_isl_name(
+/*==============*/
+	const char*	name)	/*!< in: table name */
+{
+	char*	filename;
+	ulint	namelen		= strlen(name);
+	ulint	dirlen		= strlen(fil_path_to_mysql_datadir);
+	/* sizeof "/.isl" counts the terminating NUL, so pathlen covers
+	datadir + '/' + name + ".isl" + NUL. */
+	ulint	pathlen		= dirlen + namelen + sizeof "/.isl";
+
+	filename = static_cast<char*>(mem_alloc(pathlen));
+
+	ut_snprintf(filename, pathlen, "%s/%s.isl",
+		    fil_path_to_mysql_datadir, name);
+
+	srv_normalize_path_for_win(filename);
+
+	return(filename);
+}
+
+/*******************************************************************//**
+Renames a single-table tablespace. The tablespace must be cached in the
+tablespace memory cache.
+@return TRUE if success */ +UNIV_INTERN +ibool +fil_rename_tablespace( +/*==================*/ + const char* old_name_in, /*!< in: old table name in the + standard databasename/tablename + format of InnoDB, or NULL if we + do the rename based on the space + id only */ + ulint id, /*!< in: space id */ + const char* new_name, /*!< in: new table name in the + standard databasename/tablename + format of InnoDB */ + const char* new_path_in) /*!< in: new full datafile path + if the tablespace is remotely + located, or NULL if it is located + in the normal data directory. */ +{ + ibool success; + fil_space_t* space; + fil_node_t* node; + ulint count = 0; + char* new_path; + char* old_name; + char* old_path; + const char* not_given = "(name not specified)"; + + ut_a(id != 0); + +retry: + count++; + + if (!(count % 1000)) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Warning: problems renaming ", stderr); + ut_print_filename(stderr, + old_name_in ? old_name_in : not_given); + fputs(" to ", stderr); + ut_print_filename(stderr, new_name); + fprintf(stderr, ", %lu iterations\n", (ulong) count); + } + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(id); + + DBUG_EXECUTE_IF("fil_rename_tablespace_failure_1", space = NULL; ); + + if (space == NULL) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot find space id %lu in the tablespace " + "memory cache, though the table '%s' in a " + "rename operation should have that id.", + (ulong) id, old_name_in ? old_name_in : not_given); + mutex_exit(&fil_system->mutex); + + return(FALSE); + } + + if (count > 25000) { + space->stop_ios = FALSE; + mutex_exit(&fil_system->mutex); + + return(FALSE); + } + + /* We temporarily close the .ibd file because we do not trust that + operating systems can rename an open file. For the closing we have to + wait until there are no pending i/o's or flushes on the file. 
*/ + + space->stop_ios = TRUE; + + /* The following code must change when InnoDB supports + multiple datafiles per tablespace. */ + ut_a(UT_LIST_GET_LEN(space->chain) == 1); + node = UT_LIST_GET_FIRST(space->chain); + + if (node->n_pending > 0 + || node->n_pending_flushes > 0 + || node->being_extended) { + /* There are pending i/o's or flushes or the file is + currently being extended, sleep for a while and + retry */ + + mutex_exit(&fil_system->mutex); + + os_thread_sleep(20000); + + goto retry; + + } else if (node->modification_counter > node->flush_counter) { + /* Flush the space */ + + mutex_exit(&fil_system->mutex); + + os_thread_sleep(20000); + + fil_flush(id); + + goto retry; + + } else if (node->open) { + /* Close the file */ + + fil_node_close_file(node, fil_system); + } + + /* Check that the old name in the space is right */ + + if (old_name_in) { + old_name = mem_strdup(old_name_in); + ut_a(strcmp(space->name, old_name) == 0); + } else { + old_name = mem_strdup(space->name); + } + old_path = mem_strdup(node->name); + + /* Rename the tablespace and the node in the memory cache */ + new_path = new_path_in ? 
mem_strdup(new_path_in) + : fil_make_ibd_name(new_name, false); + + success = fil_rename_tablespace_in_mem( + space, node, new_name, new_path); + + if (success) { + + DBUG_EXECUTE_IF("fil_rename_tablespace_failure_2", + goto skip_second_rename; ); + + success = os_file_rename( + innodb_file_data_key, old_path, new_path); + + DBUG_EXECUTE_IF("fil_rename_tablespace_failure_2", +skip_second_rename: + success = FALSE; ); + + if (!success) { + /* We have to revert the changes we made + to the tablespace memory cache */ + + ut_a(fil_rename_tablespace_in_mem( + space, node, old_name, old_path)); + } + } + + space->stop_ios = FALSE; + + mutex_exit(&fil_system->mutex); + +#ifndef UNIV_HOTBACKUP + if (success && !recv_recovery_on) { + mtr_t mtr; + + mtr_start(&mtr); + + fil_op_write_log(MLOG_FILE_RENAME, id, 0, 0, old_name, new_name, + &mtr); + mtr_commit(&mtr); + } +#endif /* !UNIV_HOTBACKUP */ + + mem_free(new_path); + mem_free(old_path); + mem_free(old_name); + + return(success); +} + +/*******************************************************************//** +Creates a new InnoDB Symbolic Link (ISL) file. It is always created +under the 'datadir' of MySQL. The datadir is the directory of a +running mysqld program. We can refer to it by simply using the path '.'. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fil_create_link_file( +/*=================*/ + const char* tablename, /*!< in: tablename */ + const char* filepath) /*!< in: pathname of tablespace */ +{ + os_file_t file; + ibool success; + dberr_t err = DB_SUCCESS; + char* link_filepath; + char* prev_filepath = fil_read_link_file(tablename); + + ut_ad(!srv_read_only_mode); + + if (prev_filepath) { + /* Truncate will call this with an existing + link file which contains the same filepath. 
*/ + if (0 == strcmp(prev_filepath, filepath)) { + mem_free(prev_filepath); + return(DB_SUCCESS); + } + mem_free(prev_filepath); + } + + link_filepath = fil_make_isl_name(tablename); + + file = os_file_create_simple_no_error_handling( + innodb_file_data_key, link_filepath, + OS_FILE_CREATE, OS_FILE_READ_WRITE, &success, 0); + + if (!success) { + /* The following call will print an error message */ + ulint error = os_file_get_last_error(true); + + ut_print_timestamp(stderr); + fputs(" InnoDB: Cannot create file ", stderr); + ut_print_filename(stderr, link_filepath); + fputs(".\n", stderr); + + if (error == OS_FILE_ALREADY_EXISTS) { + fputs("InnoDB: The link file: ", stderr); + ut_print_filename(stderr, filepath); + fputs(" already exists.\n", stderr); + err = DB_TABLESPACE_EXISTS; + + } else if (error == OS_FILE_DISK_FULL) { + err = DB_OUT_OF_FILE_SPACE; + + } else if (error == OS_FILE_OPERATION_NOT_SUPPORTED) { + err = DB_UNSUPPORTED; + } else { + err = DB_ERROR; + } + + /* file is not open, no need to close it. */ + mem_free(link_filepath); + return(err); + } + + if (!os_file_write(link_filepath, file, filepath, 0, + strlen(filepath))) { + err = DB_ERROR; + } + + /* Close the file, we only need it at startup */ + os_file_close(file); + + mem_free(link_filepath); + + return(err); +} + +/*******************************************************************//** +Deletes an InnoDB Symbolic Link (ISL) file. */ +UNIV_INTERN +void +fil_delete_link_file( +/*=================*/ + const char* tablename) /*!< in: name of table */ +{ + char* link_filepath = fil_make_isl_name(tablename); + + os_file_delete_if_exists(innodb_file_data_key, link_filepath); + + mem_free(link_filepath); +} + +/*******************************************************************//** +Reads an InnoDB Symbolic Link (ISL) file. +It is always created under the 'datadir' of MySQL. The name is of the +form {databasename}/{tablename}. 
and the isl file is expected to be in a +'{databasename}' directory called '{tablename}.isl'. The caller must free +the memory of the null-terminated path returned if it is not null. +@return own: filepath found in link file, NULL if not found. */ +UNIV_INTERN +char* +fil_read_link_file( +/*===============*/ + const char* name) /*!< in: tablespace name */ +{ + char* filepath = NULL; + char* link_filepath; + FILE* file = NULL; + + /* The .isl file is in the 'normal' tablespace location. */ + link_filepath = fil_make_isl_name(name); + + file = fopen(link_filepath, "r+b"); + + mem_free(link_filepath); + + if (file) { + filepath = static_cast(mem_alloc(OS_FILE_MAX_PATH)); + + os_file_read_string(file, filepath, OS_FILE_MAX_PATH); + fclose(file); + + if (strlen(filepath)) { + /* Trim whitespace from end of filepath */ + ulint lastch = strlen(filepath) - 1; + while (lastch > 4 && filepath[lastch] <= 0x20) { + filepath[lastch--] = 0x00; + } + srv_normalize_path_for_win(filepath); + } + } + + return(filepath); +} + +/*******************************************************************//** +Opens a handle to the file linked to in an InnoDB Symbolic Link file. +@return TRUE if remote linked tablespace file is found and opened. */ +UNIV_INTERN +ibool +fil_open_linked_file( +/*===============*/ + const char* tablename, /*!< in: database/tablename */ + char** remote_filepath,/*!< out: remote filepath */ + os_file_t* remote_file, /*!< out: remote file handle */ + ulint atomic_writes) /*!< in: atomic writes table option + value */ +{ + ibool success; + + *remote_filepath = fil_read_link_file(tablename); + if (*remote_filepath == NULL) { + return(FALSE); + } + + /* The filepath provided is different from what was + found in the link file. 
*/ + *remote_file = os_file_create_simple_no_error_handling( + innodb_file_data_key, *remote_filepath, + OS_FILE_OPEN, OS_FILE_READ_ONLY, + &success, atomic_writes); + + if (!success) { + char* link_filepath = fil_make_isl_name(tablename); + + /* The following call prints an error message */ + os_file_get_last_error(true); + + ib_logf(IB_LOG_LEVEL_ERROR, + "A link file was found named '%s' " + "but the linked tablespace '%s' " + "could not be opened.", + link_filepath, *remote_filepath); + + mem_free(link_filepath); + mem_free(*remote_filepath); + *remote_filepath = NULL; + } + + return(success); +} + +/*******************************************************************//** +Creates a new single-table tablespace to a database directory of MySQL. +Database directories are under the 'datadir' of MySQL. The datadir is the +directory of a running mysqld program. We can refer to it by simply the +path '.'. Tables created with CREATE TEMPORARY TABLE we place in the temp +dir of the mysqld server. 
+ +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fil_create_new_single_table_tablespace( +/*===================================*/ + ulint space_id, /*!< in: space id */ + const char* tablename, /*!< in: the table name in the usual + databasename/tablename format + of InnoDB */ + const char* dir_path, /*!< in: NULL or a dir path */ + ulint flags, /*!< in: tablespace flags */ + ulint flags2, /*!< in: table flags2 */ + ulint size) /*!< in: the initial size of the + tablespace file in pages, + must be >= FIL_IBD_FILE_INITIAL_SIZE */ +{ + os_file_t file; + ibool ret; + dberr_t err; + byte* buf2; + byte* page; + char* path; + ibool success; + /* TRUE if a table is created with CREATE TEMPORARY TABLE */ + bool is_temp = !!(flags2 & DICT_TF2_TEMPORARY); + bool has_data_dir = FSP_FLAGS_HAS_DATA_DIR(flags); + ulint atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(flags); + + ut_a(space_id > 0); + ut_ad(!srv_read_only_mode); + ut_a(space_id < SRV_LOG_SPACE_FIRST_ID); + ut_a(size >= FIL_IBD_FILE_INITIAL_SIZE); + ut_a(fsp_flags_is_valid(flags)); + + if (is_temp) { + /* Temporary table filepath */ + ut_ad(dir_path); + path = fil_make_ibd_name(dir_path, true); + } else if (has_data_dir) { + ut_ad(dir_path); + path = os_file_make_remote_pathname(dir_path, tablename, "ibd"); + + /* Since this tablespace file will be created in a + remote directory, let's create the subdirectories + in the path, if they are not there already. 
*/ + success = os_file_create_subdirs_if_needed(path); + if (!success) { + err = DB_ERROR; + goto error_exit_3; + } + } else { + path = fil_make_ibd_name(tablename, false); + } + + file = os_file_create( + innodb_file_data_key, path, + OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT, + OS_FILE_NORMAL, + OS_DATA_FILE, + &ret, + atomic_writes); + + if (ret == FALSE) { + /* The following call will print an error message */ + ulint error = os_file_get_last_error(true); + + ib_logf(IB_LOG_LEVEL_ERROR, + "Cannot create file '%s'\n", path); + + if (error == OS_FILE_ALREADY_EXISTS) { + ib_logf(IB_LOG_LEVEL_ERROR, + "The file '%s' already exists though the " + "corresponding table did not exist " + "in the InnoDB data dictionary. " + "Have you moved InnoDB .ibd files " + "around without using the SQL commands " + "DISCARD TABLESPACE and IMPORT TABLESPACE, " + "or did mysqld crash in the middle of " + "CREATE TABLE? " + "You can resolve the problem by removing " + "the file '%s' under the 'datadir' of MySQL.", + path, path); + + err = DB_TABLESPACE_EXISTS; + goto error_exit_3; + } + + if (error == OS_FILE_OPERATION_NOT_SUPPORTED) { + err = DB_UNSUPPORTED; + goto error_exit_3; + } + + if (error == OS_FILE_DISK_FULL) { + err = DB_OUT_OF_FILE_SPACE; + goto error_exit_3; + } + + err = DB_ERROR; + goto error_exit_3; + } + + ret = os_file_set_size(path, file, size * UNIV_PAGE_SIZE); + + if (!ret) { + err = DB_OUT_OF_FILE_SPACE; + goto error_exit_2; + } + + /* printf("Creating tablespace %s id %lu\n", path, space_id); */ + + /* We have to write the space id to the file immediately and flush the + file to disk. This is because in crash recovery we must be aware what + tablespaces exist and what are their space id's, so that we can apply + the log records to the right file. It may take quite a while until + buffer pool flush algorithms write anything to the file and flush it to + disk. 
If we would not write here anything, the file would be filled + with zeros from the call of os_file_set_size(), until a buffer pool + flush would write to it. */ + + buf2 = static_cast(ut_malloc(3 * UNIV_PAGE_SIZE)); + /* Align the memory for file i/o if we might have O_DIRECT set */ + page = static_cast(ut_align(buf2, UNIV_PAGE_SIZE)); + + memset(page, '\0', UNIV_PAGE_SIZE); + + /* Add the UNIV_PAGE_SIZE to the table flags and write them to the + tablespace header. */ + flags = fsp_flags_set_page_size(flags, UNIV_PAGE_SIZE); + fsp_header_init_fields(page, space_id, flags); + mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id); + ut_ad(fsp_flags_is_valid(flags)); + + if (!(fsp_flags_is_compressed(flags))) { + buf_flush_init_for_writing(page, NULL, 0); + ret = os_file_write(path, file, page, 0, UNIV_PAGE_SIZE); + } else { + page_zip_des_t page_zip; + ulint zip_size; + + zip_size = fsp_flags_get_zip_size(flags); + + page_zip_set_size(&page_zip, zip_size); + page_zip.data = page + UNIV_PAGE_SIZE; +#ifdef UNIV_DEBUG + page_zip.m_start = +#endif /* UNIV_DEBUG */ + page_zip.m_end = page_zip.m_nonempty = + page_zip.n_blobs = 0; + buf_flush_init_for_writing(page, &page_zip, 0); + ret = os_file_write(path, file, page_zip.data, 0, zip_size); + } + + ut_free(buf2); + + if (!ret) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Could not write the first page to tablespace " + "'%s'", path); + + err = DB_ERROR; + goto error_exit_2; + } + + ret = os_file_flush(file); + + if (!ret) { + ib_logf(IB_LOG_LEVEL_ERROR, + "File flush of tablespace '%s' failed", path); + err = DB_ERROR; + goto error_exit_2; + } + + if (has_data_dir) { + /* Now that the IBD file is created, make the ISL file. 
*/ + err = fil_create_link_file(tablename, path); + if (err != DB_SUCCESS) { + goto error_exit_2; + } + } + + success = fil_space_create(tablename, space_id, flags, FIL_TABLESPACE); + if (!success || !fil_node_create(path, size, space_id, FALSE)) { + err = DB_ERROR; + goto error_exit_1; + } + +#ifndef UNIV_HOTBACKUP + { + mtr_t mtr; + ulint mlog_file_flag = 0; + + if (is_temp) { + mlog_file_flag |= MLOG_FILE_FLAG_TEMP; + } + + mtr_start(&mtr); + + fil_op_write_log(flags + ? MLOG_FILE_CREATE2 + : MLOG_FILE_CREATE, + space_id, mlog_file_flag, flags, + tablename, NULL, &mtr); + + mtr_commit(&mtr); + } +#endif + err = DB_SUCCESS; + + /* Error code is set. Cleanup the various variables used. + These labels reflect the order in which variables are assigned or + actions are done. */ +error_exit_1: + if (has_data_dir && err != DB_SUCCESS) { + fil_delete_link_file(tablename); + } +error_exit_2: + os_file_close(file); + if (err != DB_SUCCESS) { + os_file_delete(innodb_file_data_key, path); + } +error_exit_3: + mem_free(path); + + return(err); +} + +#ifndef UNIV_HOTBACKUP +/********************************************************************//** +Report information about a bad tablespace. */ +static +void +fil_report_bad_tablespace( +/*======================*/ + const char* filepath, /*!< in: filepath */ + const char* check_msg, /*!< in: fil_check_first_page() */ + ulint found_id, /*!< in: found space ID */ + ulint found_flags, /*!< in: found flags */ + ulint expected_id, /*!< in: expected space id */ + ulint expected_flags) /*!< in: expected flags */ +{ + if (check_msg) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Error %s in file '%s'," + "tablespace id=%lu, flags=%lu. 
" + "Please refer to " + REFMAN "innodb-troubleshooting-datadict.html " + "for how to resolve the issue.", + check_msg, filepath, + (ulong) expected_id, (ulong) expected_flags); + return; + } + + ib_logf(IB_LOG_LEVEL_ERROR, + "In file '%s', tablespace id and flags are %lu and %lu, " + "but in the InnoDB data dictionary they are %lu and %lu. " + "Have you moved InnoDB .ibd files around without using the " + "commands DISCARD TABLESPACE and IMPORT TABLESPACE? " + "Please refer to " + REFMAN "innodb-troubleshooting-datadict.html " + "for how to resolve the issue.", + filepath, (ulong) found_id, (ulong) found_flags, + (ulong) expected_id, (ulong) expected_flags); +} + +/********************************************************************//** +Tries to open a single-table tablespace and optionally checks that the +space id in it is correct. If this does not succeed, print an error message +to the .err log. This function is used to open a tablespace when we start +mysqld after the dictionary has been booted, and also in IMPORT TABLESPACE. + +NOTE that we assume this operation is used either at the database startup +or under the protection of the dictionary mutex, so that two users cannot +race here. This operation does not leave the file associated with the +tablespace open, but closes it after we have looked at the space id in it. + +If the validate boolean is set, we read the first page of the file and +check that the space id in the file is what we expect. We assume that +this function runs much faster if no check is made, since accessing the +file inode probably is much faster (the OS caches them) than accessing +the first page of the file. This boolean may be initially FALSE, but if +a remote tablespace is found it will be changed to true. + +If the fix_dict boolean is set, then it is safe to use an internal SQL +statement to update the dictionary tables if they are incorrect. 
+
+@return DB_SUCCESS or error code: DB_CORRUPTION if the flags are invalid
+or undefined, if no valid tablespace file was found, or if several files
+were found and at least one of them failed validation; DB_ERROR if several
+files were found and all passed validation, or if the tablespace could not
+be added to the tablespace memory cache */
+UNIV_INTERN
+dberr_t
+fil_open_single_table_tablespace(
+/*=============================*/
+	bool		validate,	/*!< in: Do we validate tablespace? */
+	bool		fix_dict,	/*!< in: Can we fix the dictionary? */
+	ulint		id,		/*!< in: space id */
+	ulint		flags,		/*!< in: tablespace flags */
+	const char*	tablename,	/*!< in: table name in the
+					databasename/tablename format */
+	const char*	path_in)	/*!< in: tablespace filepath */
+{
+	dberr_t		err = DB_SUCCESS;
+	bool		dict_filepath_same_as_default = false;
+	bool		link_file_found = false;
+	bool		link_file_is_bad = false;
+	/* The three candidate locations: the default datadir path, the
+	path stored in the dictionary (path_in) and the path named by the
+	link (.isl) file. */
+	fsp_open_info	def;
+	fsp_open_info	dict;
+	fsp_open_info	remote;
+	ulint		tablespaces_found = 0;
+	ulint		valid_tablespaces_found = 0;
+	ulint		atomic_writes = 0;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!fix_dict || rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(!fix_dict || mutex_own(&(dict_sys->mutex)));
+
+	/* Table flags can be ULINT_UNDEFINED if
+	dict_tf_to_fsp_flags_failure is set. */
+	if (flags != ULINT_UNDEFINED) {
+		if (!fsp_flags_is_valid(flags)) {
+			return(DB_CORRUPTION);
+		}
+	} else {
+		return(DB_CORRUPTION);
+	}
+
+	atomic_writes = fsp_flags_get_atomic_writes(flags);
+
+	/* If the tablespace was relocated, we do not
+	compare the DATA_DIR flag */
+	ulint mod_flags = flags & ~FSP_FLAGS_MASK_DATA_DIR;
+
+	memset(&def, 0, sizeof(def));
+	memset(&dict, 0, sizeof(dict));
+	memset(&remote, 0, sizeof(remote));
+
+	/* Discover the correct filepath.  We will always look for an ibd
+	in the default location. If it is remote, it should not be here. */
+	def.filepath = fil_make_ibd_name(tablename, false);
+
+	/* The path_in was read from SYS_DATAFILES. */
+	if (path_in) {
+		if (strcmp(def.filepath, path_in)) {
+			dict.filepath = mem_strdup(path_in);
+			/* possibility of multiple files. */
+			validate = true;
+		} else {
+			dict_filepath_same_as_default = true;
+		}
+	}
+
+	link_file_found = fil_open_linked_file(
+		tablename, &remote.filepath, &remote.file, atomic_writes);
+	remote.success = link_file_found;
+	if (remote.success) {
+		/* possibility of multiple files. */
+		validate = true;
+		tablespaces_found++;
+
+		/* A link file was found. MySQL does not allow a DATA
+		DIRECTORY to be the same as the default filepath. */
+		ut_a(strcmp(def.filepath, remote.filepath));
+
+		/* If there was a filepath found in SYS_DATAFILES,
+		we hope it was the same as this remote.filepath found
+		in the ISL file. In that case the remote handle is dropped
+		here and the same file is opened again through
+		dict.filepath below, so it is only counted once. */
+		if (dict.filepath
+		    && (0 == strcmp(dict.filepath, remote.filepath))) {
+			remote.success = FALSE;
+			os_file_close(remote.file);
+			mem_free(remote.filepath);
+			remote.filepath = NULL;
+			tablespaces_found--;
+		}
+	}
+
+	/* Attempt to open the tablespace at other possible filepaths. */
+	if (dict.filepath) {
+		dict.file = os_file_create_simple_no_error_handling(
+			innodb_file_data_key, dict.filepath, OS_FILE_OPEN,
+			OS_FILE_READ_ONLY, &dict.success, atomic_writes);
+		if (dict.success) {
+			/* possibility of multiple files. */
+			validate = true;
+			tablespaces_found++;
+		}
+	}
+
+	/* Always look for a file at the default location. */
+	ut_a(def.filepath);
+	def.file = os_file_create_simple_no_error_handling(
+		innodb_file_data_key, def.filepath, OS_FILE_OPEN,
+		OS_FILE_READ_ONLY, &def.success, atomic_writes);
+	if (def.success) {
+		tablespaces_found++;
+	}
+
+	/*  We have now checked all possible tablespace locations and
+	have a count of how many we found.  If things are normal, we
+	only found 1. */
+	if (!validate && tablespaces_found == 1) {
+		goto skip_validate;
+	}
+
+	/* Read the first page of the datadir tablespace, if found. */
+	if (def.success) {
+		def.check_msg = fil_read_first_page(
+			def.file, FALSE, &def.flags, &def.id,
+			&def.lsn, &def.lsn, id);
+		def.valid = !def.check_msg;
+
+		/* Validate this single-table-tablespace with SYS_TABLES,
+		but do not compare the DATA_DIR flag, in case the
+		tablespace was relocated. */
+		if (def.valid && def.id == id
+		    && (def.flags & ~FSP_FLAGS_MASK_DATA_DIR) == mod_flags) {
+			valid_tablespaces_found++;
+		} else {
+			def.valid = false;
+			/* Do not use this tablespace. */
+			fil_report_bad_tablespace(
+				def.filepath, def.check_msg, def.id,
+				def.flags, id, flags);
+		}
+	}
+
+	/* Read the first page of the remote tablespace */
+	if (remote.success) {
+		remote.check_msg = fil_read_first_page(
+			remote.file, FALSE, &remote.flags, &remote.id,
+			&remote.lsn, &remote.lsn, id);
+		remote.valid = !remote.check_msg;
+
+		/* Validate this single-table-tablespace with SYS_TABLES,
+		but do not compare the DATA_DIR flag, in case the
+		tablespace was relocated. */
+		if (remote.valid && remote.id == id
+		    && (remote.flags & ~FSP_FLAGS_MASK_DATA_DIR) == mod_flags) {
+			valid_tablespaces_found++;
+		} else {
+			remote.valid = false;
+			/* Do not use this linked tablespace. */
+			fil_report_bad_tablespace(
+				remote.filepath, remote.check_msg, remote.id,
+				remote.flags, id, flags);
+			link_file_is_bad = true;
+		}
+	}
+
+	/* Read the first page of the tablespace at the filepath stored
+	in the dictionary (SYS_DATAFILES), if found. */
+	if (dict.success) {
+		dict.check_msg = fil_read_first_page(
+			dict.file, FALSE, &dict.flags, &dict.id,
+			&dict.lsn, &dict.lsn, id);
+		dict.valid = !dict.check_msg;
+
+		/* Validate this single-table-tablespace with SYS_TABLES,
+		but do not compare the DATA_DIR flag, in case the
+		tablespace was relocated. */
+		if (dict.valid && dict.id == id
+		    && (dict.flags & ~FSP_FLAGS_MASK_DATA_DIR) == mod_flags) {
+			valid_tablespaces_found++;
+		} else {
+			dict.valid = false;
+			/* Do not use this tablespace. */
+			fil_report_bad_tablespace(
+				dict.filepath, dict.check_msg, dict.id,
+				dict.flags, id, flags);
+		}
+	}
+
+	/* Make sense of these three possible locations.
+	First, bail out if no tablespace files were found. */
+	if (valid_tablespaces_found == 0) {
+		/* The following call prints an error message */
+		os_file_get_last_error(true);
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Could not find a valid tablespace file for '%s'. "
+			"See " REFMAN "innodb-troubleshooting-datadict.html "
+			"for how to resolve the issue.",
+			tablename);
+
+		err = DB_CORRUPTION;
+
+		goto cleanup_and_exit;
+	}
+
+	/* Do not open any tablespaces if more than one tablespace with
+	the correct space ID and flags were found. */
+	if (tablespaces_found > 1) {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"A tablespace for %s has been found in "
+			"multiple places;", tablename);
+		if (def.success) {
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"Default location; %s, LSN=" LSN_PF
+				", Space ID=%lu, Flags=%lu",
+				def.filepath, def.lsn,
+				(ulong) def.id, (ulong) def.flags);
+		}
+		if (remote.success) {
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"Remote location; %s, LSN=" LSN_PF
+				", Space ID=%lu, Flags=%lu",
+				remote.filepath, remote.lsn,
+				(ulong) remote.id, (ulong) remote.flags);
+		}
+		if (dict.success) {
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"Dictionary location; %s, LSN=" LSN_PF
+				", Space ID=%lu, Flags=%lu",
+				dict.filepath, dict.lsn,
+				(ulong) dict.id, (ulong) dict.flags);
+		}
+
+		/* Force-recovery will allow some tablespaces to be
+		skipped by REDO if there was more than one file found.
+		Unlike during the REDO phase of recovery, we now know
+		if the tablespace is valid according to the dictionary,
+		which was not available then. So if we did not force
+		recovery and there is only one good tablespace, ignore
+		any bad tablespaces. */
+		if (valid_tablespaces_found > 1 || srv_force_recovery > 0) {
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"Will not open the tablespace for '%s'",
+				tablename);
+
+			/* An opened file that failed validation is
+			reported as corruption; if every opened file was
+			valid the situation is only ambiguous. */
+			if (def.success != def.valid
+			    || dict.success != dict.valid
+			    || remote.success != remote.valid) {
+				err = DB_CORRUPTION;
+			} else {
+				err = DB_ERROR;
+			}
+			goto cleanup_and_exit;
+		}
+
+		/* There is only one valid tablespace found and we did
+		not use srv_force_recovery during REDO.  Use this one
+		tablespace and clean up invalid tablespace pointers */
+		if (def.success && !def.valid) {
+			def.success = false;
+			os_file_close(def.file);
+			tablespaces_found--;
+		}
+		if (dict.success && !dict.valid) {
+			dict.success = false;
+			os_file_close(dict.file);
+			/* Leave dict.filepath so that SYS_DATAFILES
+			can be corrected below. */
+			tablespaces_found--;
+		}
+		if (remote.success && !remote.valid) {
+			remote.success = false;
+			os_file_close(remote.file);
+			mem_free(remote.filepath);
+			remote.filepath = NULL;
+			tablespaces_found--;
+		}
+	}
+
+	/* At this point, there should be only one filepath. */
+	ut_a(tablespaces_found == 1);
+	ut_a(valid_tablespaces_found == 1);
+
+	/* Only fix the dictionary at startup when there is only one thread.
+	Calls to dict_load_table() can be done while holding other latches. */
+	if (!fix_dict) {
+		goto skip_validate;
+	}
+
+	/* We may need to change what is stored in SYS_DATAFILES or
+	SYS_TABLESPACES or adjust the link file.
+	Since a failure to update SYS_TABLESPACES or SYS_DATAFILES does
+	not prevent opening and using the single_table_tablespace either
+	this time or the next, we do not check the return code or fail
+	to open the tablespace. But dict_update_filepath() will issue a
+	warning to the log. */
+	if (dict.filepath) {
+		if (remote.success) {
+			dict_update_filepath(id, remote.filepath);
+		} else if (def.success) {
+			dict_update_filepath(id, def.filepath);
+			if (link_file_is_bad) {
+				fil_delete_link_file(tablename);
+			}
+		} else if (!link_file_found || link_file_is_bad) {
+			ut_ad(dict.success);
+			/* Fix the link file if we got our filepath
+			from the dictionary but a link file did not
+			exist or it did not point to a valid file. */
+			fil_delete_link_file(tablename);
+			fil_create_link_file(tablename, dict.filepath);
+		}
+
+	} else if (remote.success && dict_filepath_same_as_default) {
+		dict_update_filepath(id, remote.filepath);
+
+	} else if (remote.success && path_in == NULL) {
+		/* SYS_DATAFILES record for this space ID was not found. */
+		dict_insert_tablespace_and_filepath(
+			id, tablename, remote.filepath, flags);
+	}
+
+skip_validate:
+	if (err != DB_SUCCESS) {
+		; // Don't load the tablespace into the cache
+	} else if (!fil_space_create(tablename, id, flags, FIL_TABLESPACE)) {
+		err = DB_ERROR;
+	} else {
+		/* We do not measure the size of the file, that is why
+		we pass the 0 below */
+
+		/* The file that is still open is used, preferring the
+		remote path, then the dictionary path, then the default. */
+		if (!fil_node_create(remote.success ? remote.filepath :
+				     dict.success ? dict.filepath :
+				     def.filepath, 0, id, FALSE)) {
+			err = DB_ERROR;
+		}
+	}
+
+	/* Every handle that is still open is closed and every path that
+	is still allocated is freed, whatever the outcome. */
+cleanup_and_exit:
+	if (remote.success) {
+		os_file_close(remote.file);
+	}
+	if (remote.filepath) {
+		mem_free(remote.filepath);
+	}
+	if (dict.success) {
+		os_file_close(dict.file);
+	}
+	if (dict.filepath) {
+		mem_free(dict.filepath);
+	}
+	if (def.success) {
+		os_file_close(def.file);
+	}
+	mem_free(def.filepath);
+
+	return(err);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_HOTBACKUP
+/*******************************************************************//**
+Allocates a file name for an old version of a single-table tablespace.
+The string must be freed by caller with mem_free()!
+@return own: file name */ +static +char* +fil_make_ibbackup_old_name( +/*=======================*/ + const char* name) /*!< in: original file name */ +{ + static const char suffix[] = "_ibbackup_old_vers_"; + char* path; + ulint len = strlen(name); + + path = static_cast(mem_alloc(len + (15 + sizeof suffix))); + + memcpy(path, name, len); + memcpy(path + len, suffix, (sizeof suffix) - 1); + ut_sprintf_timestamp_without_extra_chars( + path + len + ((sizeof suffix) - 1)); + return(path); +} +#endif /* UNIV_HOTBACKUP */ + + +/*******************************************************************//** +Determine the space id of the given file descriptor by reading a few +pages from the beginning of the .ibd file. +@return true if space id was successfully identified, or false. */ +static +bool +fil_user_tablespace_find_space_id( +/*==============================*/ + fsp_open_info* fsp) /* in/out: contains file descriptor, which is + used as input. contains space_id, which is + the output */ +{ + bool st; + os_offset_t file_size; + + file_size = os_file_get_size(fsp->file); + + if (file_size == (os_offset_t) -1) { + ib_logf(IB_LOG_LEVEL_ERROR, "Could not get file size: %s", + fsp->filepath); + return(false); + } + + /* Assuming a page size, read the space_id from each page and store it + in a map. Find out which space_id is agreed on by majority of the + pages. Choose that space_id. 
*/ + for (ulint page_size = UNIV_ZIP_SIZE_MIN; + page_size <= UNIV_PAGE_SIZE_MAX; page_size <<= 1) { + + /* map[space_id] = count of pages */ + std::map verify; + + ulint page_count = 64; + ulint valid_pages = 0; + + /* Adjust the number of pages to analyze based on file size */ + while ((page_count * page_size) > file_size) { + --page_count; + } + + ib_logf(IB_LOG_LEVEL_INFO, "Page size:%lu Pages to analyze:" + "%lu", page_size, page_count); + + byte* buf = static_cast(ut_malloc(2*page_size)); + byte* page = static_cast(ut_align(buf, page_size)); + + for (ulint j = 0; j < page_count; ++j) { + + st = os_file_read(fsp->file, page, (j* page_size), page_size, + fsp_flags_is_page_compressed(fsp->flags)); + + if (!st) { + ib_logf(IB_LOG_LEVEL_INFO, + "READ FAIL: page_no:%lu", j); + continue; + } + + bool uncompressed_ok = false; + + /* For uncompressed pages, the page size must be equal + to UNIV_PAGE_SIZE. */ + if (page_size == UNIV_PAGE_SIZE) { + uncompressed_ok = !buf_page_is_corrupted( + false, page, 0); + } + + bool compressed_ok = !buf_page_is_corrupted( + false, page, page_size); + + if (uncompressed_ok || compressed_ok) { + + ulint space_id = mach_read_from_4(page + + FIL_PAGE_SPACE_ID); + + if (space_id > 0) { + ib_logf(IB_LOG_LEVEL_INFO, + "VALID: space:%lu " + "page_no:%lu page_size:%lu", + space_id, j, page_size); + verify[space_id]++; + ++valid_pages; + } + } + } + + ut_free(buf); + + ib_logf(IB_LOG_LEVEL_INFO, "Page size: %lu, Possible space_id " + "count:%lu", page_size, (ulint) verify.size()); + + const ulint pages_corrupted = 3; + for (ulint missed = 0; missed <= pages_corrupted; ++missed) { + + for (std::map::iterator + m = verify.begin(); m != verify.end(); ++m ) { + + ib_logf(IB_LOG_LEVEL_INFO, "space_id:%lu, " + "Number of pages matched: %lu/%lu " + "(%lu)", m->first, m->second, + valid_pages, page_size); + + if (m->second == (valid_pages - missed)) { + + ib_logf(IB_LOG_LEVEL_INFO, + "Chosen space:%lu\n", m->first); + + fsp->id = m->first; + 
return(true); + } + } + + } + } + + return(false); +} + +/*******************************************************************//** +Finds the given page_no of the given space id from the double write buffer, +and copies it to the corresponding .ibd file. +@return true if copy was successful, or false. */ +bool +fil_user_tablespace_restore_page( +/*==============================*/ + fsp_open_info* fsp, /* in: contains space id and .ibd + file information */ + ulint page_no) /* in: page_no to obtain from double + write buffer */ +{ + bool err; + ulint flags; + ulint zip_size; + ulint page_size; + ulint buflen; + byte* page; + + ib_logf(IB_LOG_LEVEL_INFO, "Restoring page %lu of tablespace %lu", + page_no, fsp->id); + + // find if double write buffer has page_no of given space id + page = recv_sys->dblwr.find_page(fsp->id, page_no); + + if (!page) { + ib_logf(IB_LOG_LEVEL_WARN, "Doublewrite does not have " + "page_no=%lu of space: %lu", page_no, fsp->id); + err = false; + goto out; + } + + flags = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page); + zip_size = fsp_flags_get_zip_size(flags); + page_size = fsp_flags_get_page_size(flags); + + ut_ad(page_no == page_get_page_no(page)); + + buflen = zip_size ? zip_size: page_size; + + ib_logf(IB_LOG_LEVEL_INFO, "Writing %lu bytes into file: %s", + buflen, fsp->filepath); + + err = os_file_write(fsp->filepath, fsp->file, page, + (zip_size ? zip_size : page_size) * page_no, + buflen); + + os_file_flush(fsp->file); +out: + return(err); +} + +/********************************************************************//** +Opens an .ibd file and adds the associated single-table tablespace to the +InnoDB fil0fil.cc data structures. +Set fsp->success to TRUE if tablespace is valid, FALSE if not. 
*/
+static
+void
+fil_validate_single_table_tablespace(
+/*=================================*/
+	const char*	tablename,	/*!< in: database/tablename */
+	fsp_open_info*	fsp)		/*!< in/out: tablespace info */
+{
+	bool	restore_attempted = false;
+
+	/* Read the first page. If that fails, make a single attempt to
+	identify the space id and restore page 0 from the doublewrite
+	buffer, then read the first page once more. */
+	for (;;) {
+		const char*	check_msg = fil_read_first_page(
+			fsp->file, FALSE, &fsp->flags, &fsp->id,
+			&fsp->lsn, &fsp->lsn, ULINT_UNDEFINED);
+
+		if (check_msg == NULL) {
+			fsp->success = TRUE;
+			break;
+		}
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"%s in tablespace %s (table %s)",
+			check_msg, fsp->filepath, tablename);
+		fsp->success = FALSE;
+
+		if (restore_attempted) {
+			/* The restore did not help; give up. */
+			return;
+		}
+
+		if (!fil_user_tablespace_find_space_id(fsp)) {
+			return;
+		}
+
+		restore_attempted = true;
+
+		if (fsp->id > 0
+		    && !fil_user_tablespace_restore_page(fsp, 0)) {
+			return;
+		}
+	}
+
+	/* The first page was readable; reject unusable space ids. */
+	if (fsp->id == ULINT_UNDEFINED || fsp->id == 0) {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Tablespace is not sensible;"
+			" Table: %s Space ID: %lu Filepath: %s\n",
+			tablename, (ulong) fsp->id, fsp->filepath);
+		fsp->success = FALSE;
+		return;
+	}
+
+	/* Refuse a space id that is already in the tablespace cache. */
+	mutex_enter(&fil_system->mutex);
+	fil_space_t*	existing = fil_space_get_by_id(fsp->id);
+	mutex_exit(&fil_system->mutex);
+
+	if (existing == NULL) {
+		fsp->success = TRUE;
+		return;
+	}
+
+	char*	earlier_path = fil_space_get_first_path(fsp->id);
+
+	ib_logf(IB_LOG_LEVEL_ERROR,
+		"Attempted to open a previously opened tablespace. "
+		"Previous tablespace %s uses space ID: %lu at "
+		"filepath: %s. Cannot open tablespace %s which uses "
+		"space ID: %lu at filepath: %s",
+		existing->name, (ulong) existing->id, earlier_path,
+		tablename, (ulong) fsp->id, fsp->filepath);
+
+	mem_free(earlier_path);
+	fsp->success = FALSE;
+}
+
+
+/********************************************************************//**
+Opens an .ibd file and adds the associated single-table tablespace to the
+InnoDB fil0fil.cc data structures.
*/ +static +void +fil_load_single_table_tablespace( +/*=============================*/ + const char* dbname, /*!< in: database name */ + const char* filename) /*!< in: file name (not a path), + including the .ibd or .isl extension */ +{ + char* tablename; + ulint tablename_len; + ulint dbname_len = strlen(dbname); + ulint filename_len = strlen(filename); + fsp_open_info def; + fsp_open_info remote; + os_offset_t size; + fil_space_t* space; + + memset(&def, 0, sizeof(def)); + memset(&remote, 0, sizeof(remote)); + + /* The caller assured that the extension is ".ibd" or ".isl". */ + ut_ad(0 == memcmp(filename + filename_len - 4, ".ibd", 4) + || 0 == memcmp(filename + filename_len - 4, ".isl", 4)); + + /* Build up the tablename in the standard form database/table. */ + tablename = static_cast( + mem_alloc(dbname_len + filename_len + 2)); + sprintf(tablename, "%s/%s", dbname, filename); + tablename_len = strlen(tablename) - strlen(".ibd"); + tablename[tablename_len] = '\0'; + + /* There may be both .ibd and .isl file in the directory. + And it is possible that the .isl file refers to a different + .ibd file. If so, we open and compare them the first time + one of them is sent to this function. So if this table has + already been loaded, there is nothing to do.*/ + mutex_enter(&fil_system->mutex); + space = fil_space_get_by_name(tablename); + if (space) { + mem_free(tablename); + mutex_exit(&fil_system->mutex); + return; + } + mutex_exit(&fil_system->mutex); + + /* Build up the filepath of the .ibd tablespace in the datadir. + This must be freed independent of def.success. */ + def.filepath = fil_make_ibd_name(tablename, false); + +#ifdef __WIN__ +# ifndef UNIV_HOTBACKUP + /* If lower_case_table_names is 0 or 2, then MySQL allows database + directory names with upper case letters. On Windows, all table and + database names in InnoDB are internally always in lower case. Put the + file path to lower case, so that we are consistent with InnoDB's + internal data dictionary. 
*/ + + dict_casedn_str(def.filepath); +# endif /* !UNIV_HOTBACKUP */ +#endif + + /* Check for a link file which locates a remote tablespace. */ + remote.success = fil_open_linked_file( + tablename, &remote.filepath, &remote.file, FALSE); + + /* Read the first page of the remote tablespace */ + if (remote.success) { + fil_validate_single_table_tablespace(tablename, &remote); + if (!remote.success) { + os_file_close(remote.file); + mem_free(remote.filepath); + } + } + + + /* Try to open the tablespace in the datadir. */ + def.file = os_file_create_simple_no_error_handling( + innodb_file_data_key, def.filepath, OS_FILE_OPEN, + OS_FILE_READ_WRITE, &def.success, FALSE); + + /* Read the first page of the remote tablespace */ + if (def.success) { + fil_validate_single_table_tablespace(tablename, &def); + if (!def.success) { + os_file_close(def.file); + } + } + + if (!def.success && !remote.success) { + /* The following call prints an error message */ + os_file_get_last_error(true); + fprintf(stderr, + "InnoDB: Error: could not open single-table" + " tablespace file %s\n", def.filepath); + + if (!strncmp(filename, + tmp_file_prefix, tmp_file_prefix_length)) { + /* Ignore errors for #sql tablespaces. 
*/ + mem_free(tablename); + if (remote.filepath) { + mem_free(remote.filepath); + } + if (def.filepath) { + mem_free(def.filepath); + } + return; + } +no_good_file: + fprintf(stderr, + "InnoDB: We do not continue the crash recovery," + " because the table may become\n" + "InnoDB: corrupt if we cannot apply the log" + " records in the InnoDB log to it.\n" + "InnoDB: To fix the problem and start mysqld:\n" + "InnoDB: 1) If there is a permission problem" + " in the file and mysqld cannot\n" + "InnoDB: open the file, you should" + " modify the permissions.\n" + "InnoDB: 2) If the table is not needed, or you" + " can restore it from a backup,\n" + "InnoDB: then you can remove the .ibd file," + " and InnoDB will do a normal\n" + "InnoDB: crash recovery and ignore that table.\n" + "InnoDB: 3) If the file system or the" + " disk is broken, and you cannot remove\n" + "InnoDB: the .ibd file, you can set" + " innodb_force_recovery > 0 in my.cnf\n" + "InnoDB: and force InnoDB to continue crash" + " recovery here.\n"); +will_not_choose: + mem_free(tablename); + if (remote.filepath) { + mem_free(remote.filepath); + } + if (def.filepath) { + mem_free(def.filepath); + } + + if (srv_force_recovery > 0) { + ib_logf(IB_LOG_LEVEL_INFO, + "innodb_force_recovery was set to %lu. 
" + "Continuing crash recovery even though we " + "cannot access the .ibd file of this table.", + srv_force_recovery); + return; + } + + exit(1); + } + + if (def.success && remote.success) { + ib_logf(IB_LOG_LEVEL_ERROR, + "Tablespaces for %s have been found in two places;\n" + "Location 1: SpaceID: %lu LSN: %lu File: %s\n" + "Location 2: SpaceID: %lu LSN: %lu File: %s\n" + "You must delete one of them.", + tablename, (ulong) def.id, (ulong) def.lsn, + def.filepath, (ulong) remote.id, (ulong) remote.lsn, + remote.filepath); + + def.success = FALSE; + os_file_close(def.file); + os_file_close(remote.file); + goto will_not_choose; + } + + /* At this point, only one tablespace is open */ + ut_a(def.success == !remote.success); + + fsp_open_info* fsp = def.success ? &def : &remote; + + /* Get and test the file size. */ + size = os_file_get_size(fsp->file); + + if (size == (os_offset_t) -1) { + /* The following call prints an error message */ + os_file_get_last_error(true); + + ib_logf(IB_LOG_LEVEL_ERROR, + "could not measure the size of single-table " + "tablespace file %s", fsp->filepath); + + os_file_close(fsp->file); + goto no_good_file; + } + + /* Every .ibd file is created >= 4 pages in size. Smaller files + cannot be ok. 
*/ + ulong minimum_size = FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE; + if (size < minimum_size) { +#ifndef UNIV_HOTBACKUP + ib_logf(IB_LOG_LEVEL_ERROR, + "The size of single-table tablespace file %s " + "is only " UINT64PF ", should be at least %lu!", + fsp->filepath, size, minimum_size); + os_file_close(fsp->file); + goto no_good_file; +#else + fsp->id = ULINT_UNDEFINED; + fsp->flags = 0; +#endif /* !UNIV_HOTBACKUP */ + } + +#ifdef UNIV_HOTBACKUP + if (fsp->id == ULINT_UNDEFINED || fsp->id == 0) { + char* new_path; + + fprintf(stderr, + "InnoDB: Renaming tablespace %s of id %lu,\n" + "InnoDB: to %s_ibbackup_old_vers_\n" + "InnoDB: because its size %" PRId64 " is too small" + " (< 4 pages 16 kB each),\n" + "InnoDB: or the space id in the file header" + " is not sensible.\n" + "InnoDB: This can happen in an ibbackup run," + " and is not dangerous.\n", + fsp->filepath, fsp->id, fsp->filepath, size); + os_file_close(fsp->file); + + new_path = fil_make_ibbackup_old_name(fsp->filepath); + + bool success = os_file_rename( + innodb_file_data_key, fsp->filepath, new_path); + + ut_a(success); + + mem_free(new_path); + + goto func_exit_after_close; + } + + /* A backup may contain the same space several times, if the space got + renamed at a sensitive time. Since it is enough to have one version of + the space, we rename the file if a space with the same space id + already exists in the tablespace memory cache. We rather rename the + file than delete it, because if there is a bug, we do not want to + destroy valuable data. */ + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(fsp->id); + + if (space) { + char* new_path; + + fprintf(stderr, + "InnoDB: Renaming tablespace %s of id %lu,\n" + "InnoDB: to %s_ibbackup_old_vers_\n" + "InnoDB: because space %s with the same id\n" + "InnoDB: was scanned earlier. 
This can happen" + " if you have renamed tables\n" + "InnoDB: during an ibbackup run.\n", + fsp->filepath, fsp->id, fsp->filepath, + space->name); + os_file_close(fsp->file); + + new_path = fil_make_ibbackup_old_name(fsp->filepath); + + mutex_exit(&fil_system->mutex); + + bool success = os_file_rename( + innodb_file_data_key, fsp->filepath, new_path); + + ut_a(success); + + mem_free(new_path); + + goto func_exit_after_close; + } + mutex_exit(&fil_system->mutex); +#endif /* UNIV_HOTBACKUP */ + ibool file_space_create_success = fil_space_create( + tablename, fsp->id, fsp->flags, FIL_TABLESPACE); + + if (!file_space_create_success) { + if (srv_force_recovery > 0) { + fprintf(stderr, + "InnoDB: innodb_force_recovery was set" + " to %lu. Continuing crash recovery\n" + "InnoDB: even though the tablespace" + " creation of this table failed.\n", + srv_force_recovery); + goto func_exit; + } + + /* Exit here with a core dump, stack, etc. */ + ut_a(file_space_create_success); + } + + /* We do not use the size information we have about the file, because + the rounding formula for extents and pages is somewhat complex; we + let fil_node_open() do that task. */ + + if (!fil_node_create(fsp->filepath, 0, fsp->id, FALSE)) { + ut_error; + } + +func_exit: + os_file_close(fsp->file); + +#ifdef UNIV_HOTBACKUP +func_exit_after_close: +#else + ut_ad(!mutex_own(&fil_system->mutex)); +#endif + mem_free(tablename); + if (remote.success) { + mem_free(remote.filepath); + } + mem_free(def.filepath); +} + +/***********************************************************************//** +A fault-tolerant function that tries to read the next file name in the +directory. We retry 100 times if os_file_readdir_next_file() returns -1. The +idea is to read as much good data as we can and jump over bad data. 
+@return 0 if ok, -1 if error even after the retries, 1 if at the end +of the directory */ +static +int +fil_file_readdir_next_file( +/*=======================*/ + dberr_t* err, /*!< out: this is set to DB_ERROR if an error + was encountered, otherwise not changed */ + const char* dirname,/*!< in: directory name or path */ + os_file_dir_t dir, /*!< in: directory stream */ + os_file_stat_t* info) /*!< in/out: buffer where the + info is returned */ +{ + for (ulint i = 0; i < 100; i++) { + int ret = os_file_readdir_next_file(dirname, dir, info); + + if (ret != -1) { + + return(ret); + } + + ib_logf(IB_LOG_LEVEL_ERROR, + "os_file_readdir_next_file() returned -1 in " + "directory %s, crash recovery may have failed " + "for some .ibd files!", dirname); + + *err = DB_ERROR; + } + + return(-1); +} + +#define CHECK_TIME_EVERY_N_FILES 10 +/********************************************************************//** +At the server startup, if we need crash recovery, scans the database +directories under the MySQL datadir, looking for .ibd files. Those files are +single-table tablespaces. We need to know the space id in each of them so that +we know into which file we should look to check the contents of a page stored +in the doublewrite buffer, also to know where to apply log records where the +space id is != 0. 
+@return DB_SUCCESS or error number */ +UNIV_INTERN +dberr_t +fil_load_single_table_tablespaces(void) +/*===================================*/ +{ + int ret; + char* dbpath = NULL; + ulint dbpath_len = 100; + ulint files_read = 0; + ulint files_read_at_last_check = 0; + ib_time_t prev_report_time = ut_time(); + os_file_dir_t dir; + os_file_dir_t dbdir; + os_file_stat_t dbinfo; + os_file_stat_t fileinfo; + dberr_t err = DB_SUCCESS; + + /* The datadir of MySQL is always the default directory of mysqld */ + + dir = os_file_opendir(fil_path_to_mysql_datadir, TRUE); + + if (dir == NULL) { + + return(DB_ERROR); + } + + dbpath = static_cast(mem_alloc(dbpath_len)); + + /* Scan all directories under the datadir. They are the database + directories of MySQL. */ + + ret = fil_file_readdir_next_file(&err, fil_path_to_mysql_datadir, dir, + &dbinfo); + while (ret == 0) { + ulint len; + /* printf("Looking at %s in datadir\n", dbinfo.name); */ + + if (dbinfo.type == OS_FILE_TYPE_FILE + || dbinfo.type == OS_FILE_TYPE_UNKNOWN) { + + goto next_datadir_item; + } + + /* We found a symlink or a directory; try opening it to see + if a symlink is a directory */ + + len = strlen(fil_path_to_mysql_datadir) + + strlen (dbinfo.name) + 2; + if (len > dbpath_len) { + dbpath_len = len; + + if (dbpath) { + mem_free(dbpath); + } + + dbpath = static_cast(mem_alloc(dbpath_len)); + } + ut_snprintf(dbpath, dbpath_len, + "%s/%s", fil_path_to_mysql_datadir, dbinfo.name); + srv_normalize_path_for_win(dbpath); + + dbdir = os_file_opendir(dbpath, FALSE); + + if (dbdir != NULL) { + + /* We found a database directory; loop through it, + looking for possible .ibd files in it */ + + ret = fil_file_readdir_next_file(&err, dbpath, dbdir, + &fileinfo); + while (ret == 0) { + + if (fileinfo.type == OS_FILE_TYPE_DIR) { + + goto next_file_item; + } + + /* We found a symlink or a file */ + if (strlen(fileinfo.name) > 4 + && (0 == strcmp(fileinfo.name + + strlen(fileinfo.name) - 4, + ".ibd") + || 0 == 
strcmp(fileinfo.name + + strlen(fileinfo.name) - 4, + ".isl"))) { + /* The name ends in .ibd or .isl; + try opening the file */ + fil_load_single_table_tablespace( + dbinfo.name, fileinfo.name); + files_read++; + if (files_read - files_read_at_last_check > + CHECK_TIME_EVERY_N_FILES) { + ib_time_t cur_time= ut_time(); + files_read_at_last_check= files_read; + double time_elapsed= ut_difftime(cur_time, + prev_report_time); + if (time_elapsed > 15) { + ib_logf(IB_LOG_LEVEL_INFO, + "Processed %ld .ibd/.isl files", + files_read); + prev_report_time= cur_time; + } + } + } +next_file_item: + ret = fil_file_readdir_next_file(&err, + dbpath, dbdir, + &fileinfo); + } + + if (0 != os_file_closedir(dbdir)) { + fputs("InnoDB: Warning: could not" + " close database directory ", stderr); + ut_print_filename(stderr, dbpath); + putc('\n', stderr); + + err = DB_ERROR; + } + } + +next_datadir_item: + ret = fil_file_readdir_next_file(&err, + fil_path_to_mysql_datadir, + dir, &dbinfo); + } + + mem_free(dbpath); + + if (0 != os_file_closedir(dir)) { + fprintf(stderr, + "InnoDB: Error: could not close MySQL datadir\n"); + + return(DB_ERROR); + } + + return(err); +} + +/*******************************************************************//** +Returns TRUE if a single-table tablespace does not exist in the memory cache, +or is being deleted there. 
+@return TRUE if does not exist or is being deleted */ +UNIV_INTERN +ibool +fil_tablespace_deleted_or_being_deleted_in_mem( +/*===========================================*/ + ulint id, /*!< in: space id */ + ib_int64_t version)/*!< in: tablespace_version should be this; if + you pass -1 as the value of this, then this + parameter is ignored */ +{ + fil_space_t* space; + + ut_ad(fil_system); + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(id); + + if (space == NULL || space->stop_new_ops) { + mutex_exit(&fil_system->mutex); + + return(TRUE); + } + + if (version != ((ib_int64_t)-1) + && space->tablespace_version != version) { + mutex_exit(&fil_system->mutex); + + return(TRUE); + } + + mutex_exit(&fil_system->mutex); + + return(FALSE); +} + +/*******************************************************************//** +Returns TRUE if a single-table tablespace exists in the memory cache. +@return TRUE if exists */ +UNIV_INTERN +ibool +fil_tablespace_exists_in_mem( +/*=========================*/ + ulint id) /*!< in: space id */ +{ + fil_space_t* space; + + ut_ad(fil_system); + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(id); + + mutex_exit(&fil_system->mutex); + + return(space != NULL); +} + +/*******************************************************************//** +Report that a tablespace for a table was not found. */ +static +void +fil_report_missing_tablespace( +/*===========================*/ + const char* name, /*!< in: table name */ + ulint space_id) /*!< in: table's space id */ +{ + char index_name[MAX_FULL_NAME_LEN + 1]; + + innobase_format_name(index_name, sizeof(index_name), name, TRUE); + + ib_logf(IB_LOG_LEVEL_ERROR, + "Table %s in the InnoDB data dictionary has tablespace id %lu, " + "but tablespace with that id or name does not exist. Have " + "you deleted or moved .ibd files? 
This may also be a table " + "created with CREATE TEMPORARY TABLE whose .ibd and .frm " + "files MySQL automatically removed, but the table still " + "exists in the InnoDB internal data dictionary.", + name, space_id); +} + +/*******************************************************************//** +Returns TRUE if a matching tablespace exists in the InnoDB tablespace memory +cache. Note that if we have not done a crash recovery at the database startup, +there may be many tablespaces which are not yet in the memory cache. +@return TRUE if a matching tablespace exists in the memory cache */ +UNIV_INTERN +ibool +fil_space_for_table_exists_in_mem( +/*==============================*/ + ulint id, /*!< in: space id */ + const char* name, /*!< in: table name used in + fil_space_create(). Either the + standard 'dbname/tablename' format + or table->dir_path_of_temp_table */ + ibool mark_space, /*!< in: in crash recovery, at database + startup we mark all spaces which have + an associated table in the InnoDB + data dictionary, so that + we can print a warning about orphaned + tablespaces */ + ibool print_error_if_does_not_exist, + /*!< in: print detailed error + information to the .err log if a + matching tablespace is not found from + memory */ + bool adjust_space, /*!< in: whether to adjust space id + when find table space mismatch */ + mem_heap_t* heap, /*!< in: heap memory */ + table_id_t table_id) /*!< in: table id */ +{ + fil_space_t* fnamespace; + fil_space_t* space; + + ut_ad(fil_system); + + mutex_enter(&fil_system->mutex); + + /* Look if there is a space with the same id */ + + space = fil_space_get_by_id(id); + + /* Look if there is a space with the same name; the name is the + directory path from the datadir to the file */ + + fnamespace = fil_space_get_by_name(name); + if (space && space == fnamespace) { + /* Found */ + + if (mark_space) { + space->mark = TRUE; + } + + mutex_exit(&fil_system->mutex); + + return(TRUE); + } + + /* Info from "fnamespace" comes from 
the ibd file itself, it can + be different from data obtained from System tables since it is + not transactional. If adjust_space is set, and the mismatching + space are between a user table and its temp table, we shall + adjust the ibd file name according to system table info */ + if (adjust_space + && space != NULL + && row_is_mysql_tmp_table_name(space->name) + && !row_is_mysql_tmp_table_name(name)) { + + mutex_exit(&fil_system->mutex); + + DBUG_EXECUTE_IF("ib_crash_before_adjust_fil_space", + DBUG_SUICIDE();); + + if (fnamespace) { + char* tmp_name; + + tmp_name = dict_mem_create_temporary_tablename( + heap, name, table_id); + + fil_rename_tablespace(fnamespace->name, fnamespace->id, + tmp_name, NULL); + } + + DBUG_EXECUTE_IF("ib_crash_after_adjust_one_fil_space", + DBUG_SUICIDE();); + + fil_rename_tablespace(space->name, id, name, NULL); + + DBUG_EXECUTE_IF("ib_crash_after_adjust_fil_space", + DBUG_SUICIDE();); + + mutex_enter(&fil_system->mutex); + fnamespace = fil_space_get_by_name(name); + ut_ad(space == fnamespace); + mutex_exit(&fil_system->mutex); + + return(TRUE); + } + + if (!print_error_if_does_not_exist) { + + mutex_exit(&fil_system->mutex); + + return(FALSE); + } + + if (space == NULL) { + if (fnamespace == NULL) { + if (print_error_if_does_not_exist) { + fil_report_missing_tablespace(name, id); + } + } else { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: table ", stderr); + ut_print_filename(stderr, name); + fprintf(stderr, "\n" + "InnoDB: in InnoDB data dictionary has" + " tablespace id %lu,\n" + "InnoDB: but a tablespace with that id" + " does not exist. There is\n" + "InnoDB: a tablespace of name %s and id %lu," + " though. 
Have\n" + "InnoDB: you deleted or moved .ibd files?\n", + (ulong) id, fnamespace->name, + (ulong) fnamespace->id); + } +error_exit: + fputs("InnoDB: Please refer to\n" + "InnoDB: " REFMAN "innodb-troubleshooting-datadict.html\n" + "InnoDB: for how to resolve the issue.\n", stderr); + + mutex_exit(&fil_system->mutex); + + return(FALSE); + } + + if (0 != strcmp(space->name, name)) { + ut_print_timestamp(stderr); + fputs(" InnoDB: Error: table ", stderr); + ut_print_filename(stderr, name); + fprintf(stderr, "\n" + "InnoDB: in InnoDB data dictionary has" + " tablespace id %lu,\n" + "InnoDB: but the tablespace with that id" + " has name %s.\n" + "InnoDB: Have you deleted or moved .ibd files?\n", + (ulong) id, space->name); + + if (fnamespace != NULL) { + fputs("InnoDB: There is a tablespace" + " with the right name\n" + "InnoDB: ", stderr); + ut_print_filename(stderr, fnamespace->name); + fprintf(stderr, ", but its id is %lu.\n", + (ulong) fnamespace->id); + } + + goto error_exit; + } + + mutex_exit(&fil_system->mutex); + + return(FALSE); +} + +/*******************************************************************//** +Checks if a single-table tablespace for a given table name exists in the +tablespace memory cache. +@return space id, ULINT_UNDEFINED if not found */ +UNIV_INTERN +ulint +fil_get_space_id_for_table( +/*=======================*/ + const char* tablename) /*!< in: table name in the standard + 'databasename/tablename' format */ +{ + fil_space_t* fnamespace; + ulint id = ULINT_UNDEFINED; + + ut_ad(fil_system); + + mutex_enter(&fil_system->mutex); + + /* Look if there is a space with the same name. */ + + fnamespace = fil_space_get_by_name(tablename); + + if (fnamespace) { + id = fnamespace->id; + } + + mutex_exit(&fil_system->mutex); + + return(id); +} + +/**********************************************************************//** +Tries to extend a data file so that it would accommodate the number of pages +given. 
The tablespace must be cached in the memory cache. If the space is big +enough already, does nothing. +@return TRUE if success */ +UNIV_INTERN +ibool +fil_extend_space_to_desired_size( +/*=============================*/ + ulint* actual_size, /*!< out: size of the space after extension; + if we ran out of disk space this may be lower + than the desired size */ + ulint space_id, /*!< in: space id */ + ulint size_after_extend)/*!< in: desired size in pages after the + extension; if the current space size is bigger + than this already, the function does nothing */ +{ + fil_node_t* node; + fil_space_t* space; + byte* buf2; + byte* buf; + ulint buf_size; + ulint start_page_no; + ulint file_start_page_no; + ulint page_size; + ulint pages_added; + ibool success; + + ut_ad(!srv_read_only_mode); + +retry: + pages_added = 0; + success = TRUE; + + fil_mutex_enter_and_prepare_for_io(space_id); + + space = fil_space_get_by_id(space_id); + ut_a(space); + + if (space->size >= size_after_extend) { + /* Space already big enough */ + + *actual_size = space->size; + + mutex_exit(&fil_system->mutex); + + return(TRUE); + } + + page_size = fsp_flags_get_zip_size(space->flags); + if (!page_size) { + page_size = UNIV_PAGE_SIZE; + } + + node = UT_LIST_GET_LAST(space->chain); + + if (!node->being_extended) { + /* Mark this node as undergoing extension. This flag + is used by other threads to wait for the extension + opereation to finish. */ + node->being_extended = TRUE; + } else { + /* Another thread is currently extending the file. Wait + for it to finish. + It'd have been better to use event driven mechanism but + the entire module is peppered with polling stuff. 
*/ + mutex_exit(&fil_system->mutex); + os_thread_sleep(100000); + goto retry; + } + + if (!fil_node_prepare_for_io(node, fil_system, space)) { + /* The tablespace data file, such as .ibd file, is missing */ + node->being_extended = false; + mutex_exit(&fil_system->mutex); + + return(false); + } + + /* At this point it is safe to release fil_system mutex. No + other thread can rename, delete or close the file because + we have set the node->being_extended flag. */ + mutex_exit(&fil_system->mutex); + + start_page_no = space->size; + file_start_page_no = space->size - node->size; + +#ifdef HAVE_POSIX_FALLOCATE + if (srv_use_posix_fallocate) { + os_offset_t start_offset = start_page_no * page_size; + os_offset_t n_pages = (size_after_extend - start_page_no); + os_offset_t len = n_pages * page_size; + + if (posix_fallocate(node->handle, start_offset, len) == -1) { + ib_logf(IB_LOG_LEVEL_ERROR, "preallocating file " + "space for file \'%s\' failed. Current size " + INT64PF ", desired size " INT64PF "\n", + node->name, start_offset, len+start_offset); + os_file_handle_error_no_exit(node->name, "posix_fallocate", FALSE, __FILE__, __LINE__); + success = FALSE; + } else { + success = TRUE; + } + + mutex_enter(&fil_system->mutex); + + if (success) { + node->size += n_pages; + space->size += n_pages; + os_has_said_disk_full = FALSE; + } + + /* If posix_fallocate was used to extent the file space + we need to complete the io. Because no actual writes were + dispatched read operation is enough here. Without this + there will be assertion at shutdown indicating that + all IO is not completed. 
*/ + fil_node_complete_io(node, fil_system, OS_FILE_READ); + goto file_extended; + } +#endif + + /* Extend at most 64 pages at a time */ + buf_size = ut_min(64, size_after_extend - start_page_no) * page_size; + buf2 = static_cast(mem_alloc(buf_size + page_size)); + buf = static_cast(ut_align(buf2, page_size)); + + memset(buf, 0, buf_size); + + while (start_page_no < size_after_extend) { + ulint n_pages + = ut_min(buf_size / page_size, + size_after_extend - start_page_no); + + os_offset_t offset + = ((os_offset_t) (start_page_no - file_start_page_no)) + * page_size; +#ifdef UNIV_HOTBACKUP + success = os_file_write(node->name, node->handle, buf, + offset, page_size * n_pages); +#else + success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC, + node->name, node->handle, buf, + offset, page_size * n_pages, + NULL, NULL, space_id, NULL, 0, 0, 0, 0, 0); +#endif /* UNIV_HOTBACKUP */ + if (success) { + os_has_said_disk_full = FALSE; + } else { + /* Let us measure the size of the file to determine + how much we were able to extend it */ + os_offset_t size; + + size = os_file_get_size(node->handle); + ut_a(size != (os_offset_t) -1); + + n_pages = ((ulint) (size / page_size)) + - node->size - pages_added; + + pages_added += n_pages; + break; + } + + start_page_no += n_pages; + pages_added += n_pages; + } + + mem_free(buf2); + + mutex_enter(&fil_system->mutex); + + ut_a(node->being_extended); + + space->size += pages_added; + node->size += pages_added; + + fil_node_complete_io(node, fil_system, OS_FILE_WRITE); + + /* At this point file has been extended */ +file_extended: + + node->being_extended = FALSE; + *actual_size = space->size; + +#ifndef UNIV_HOTBACKUP + if (space_id == 0) { + ulint pages_per_mb = (1024 * 1024) / page_size; + + /* Keep the last data file size info up to date, rounded to + full megabytes */ + + srv_data_file_sizes[srv_n_data_files - 1] + = (node->size / pages_per_mb) * pages_per_mb; + } +#endif /* !UNIV_HOTBACKUP */ + + /* + printf("Extended %s to %lu, actual size 
%lu pages\n", space->name, + size_after_extend, *actual_size); */ + mutex_exit(&fil_system->mutex); + + fil_flush(space_id); + + return(success); +} + +#ifdef UNIV_HOTBACKUP +/********************************************************************//** +Extends all tablespaces to the size stored in the space header. During the +ibbackup --apply-log phase we extended the spaces on-demand so that log records +could be applied, but that may have left spaces still too small compared to +the size stored in the space header. */ +UNIV_INTERN +void +fil_extend_tablespaces_to_stored_len(void) +/*======================================*/ +{ + fil_space_t* space; + byte* buf; + ulint actual_size; + ulint size_in_header; + dberr_t error; + ibool success; + + buf = mem_alloc(UNIV_PAGE_SIZE); + + mutex_enter(&fil_system->mutex); + + space = UT_LIST_GET_FIRST(fil_system->space_list); + + while (space) { + ut_a(space->purpose == FIL_TABLESPACE); + + mutex_exit(&fil_system->mutex); /* no need to protect with a + mutex, because this is a + single-threaded operation */ + error = fil_read(TRUE, space->id, + fsp_flags_get_zip_size(space->flags), + 0, 0, UNIV_PAGE_SIZE, buf, NULL); + ut_a(error == DB_SUCCESS); + + size_in_header = fsp_get_size_low(buf); + + success = fil_extend_space_to_desired_size( + &actual_size, space->id, size_in_header); + if (!success) { + fprintf(stderr, + "InnoDB: Error: could not extend the" + " tablespace of %s\n" + "InnoDB: to the size stored in header," + " %lu pages;\n" + "InnoDB: size after extension %lu pages\n" + "InnoDB: Check that you have free disk space" + " and retry!\n", + space->name, size_in_header, actual_size); + ut_a(success); + } + + mutex_enter(&fil_system->mutex); + + space = UT_LIST_GET_NEXT(space_list, space); + } + + mutex_exit(&fil_system->mutex); + + mem_free(buf); +} +#endif + +/*========== RESERVE FREE EXTENTS (for a B-tree split, for example) ===*/ + +/*******************************************************************//** +Tries to 
reserve free extents in a file space. +@return TRUE if succeed */ +UNIV_INTERN +ibool +fil_space_reserve_free_extents( +/*===========================*/ + ulint id, /*!< in: space id */ + ulint n_free_now, /*!< in: number of free extents now */ + ulint n_to_reserve) /*!< in: how many one wants to reserve */ +{ + fil_space_t* space; + ibool success; + + ut_ad(fil_system); + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(id); + + ut_a(space); + + if (space->n_reserved_extents + n_to_reserve > n_free_now) { + success = FALSE; + } else { + space->n_reserved_extents += n_to_reserve; + success = TRUE; + } + + mutex_exit(&fil_system->mutex); + + return(success); +} + +/*******************************************************************//** +Releases free extents in a file space. */ +UNIV_INTERN +void +fil_space_release_free_extents( +/*===========================*/ + ulint id, /*!< in: space id */ + ulint n_reserved) /*!< in: how many one reserved */ +{ + fil_space_t* space; + + ut_ad(fil_system); + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(id); + + ut_a(space); + ut_a(space->n_reserved_extents >= n_reserved); + + space->n_reserved_extents -= n_reserved; + + mutex_exit(&fil_system->mutex); +} + +/*******************************************************************//** +Gets the number of reserved extents. If the database is silent, this number +should be zero. 
*/ +UNIV_INTERN +ulint +fil_space_get_n_reserved_extents( +/*=============================*/ + ulint id) /*!< in: space id */ +{ + fil_space_t* space; + ulint n; + + ut_ad(fil_system); + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(id); + + ut_a(space); + + n = space->n_reserved_extents; + + mutex_exit(&fil_system->mutex); + + return(n); +} + +/*============================ FILE I/O ================================*/ + +/********************************************************************//** +NOTE: you must call fil_mutex_enter_and_prepare_for_io() first! + +Prepares a file node for i/o. Opens the file if it is closed. Updates the +pending i/o's field in the node and the system appropriately. Takes the node +off the LRU list if it is in the LRU list. The caller must hold the fil_sys +mutex. +@return false if the file can't be opened, otherwise true */ +static +bool +fil_node_prepare_for_io( +/*====================*/ + fil_node_t* node, /*!< in: file node */ + fil_system_t* system, /*!< in: tablespace memory cache */ + fil_space_t* space) /*!< in: space */ +{ + ut_ad(node && system && space); + ut_ad(mutex_own(&(system->mutex))); + + if (system->n_open > system->max_n_open + 5) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: open files %lu" + " exceeds the limit %lu\n", + (ulong) system->n_open, + (ulong) system->max_n_open); + } + + if (node->open == FALSE) { + /* File is closed: open it */ + ut_a(node->n_pending == 0); + + if (!fil_node_open_file(node, system, space)) { + return(false); + } + } + + if (node->n_pending == 0 && fil_space_belongs_in_lru(space)) { + /* The node is in the LRU list, remove it */ + + ut_a(UT_LIST_GET_LEN(system->LRU) > 0); + + UT_LIST_REMOVE(LRU, system->LRU, node); + } + + node->n_pending++; + + return(true); +} + +/********************************************************************//** +Updates the data structures when an i/o operation finishes. 
Updates the +pending i/o's field in the node appropriately. */ +static +void +fil_node_complete_io( +/*=================*/ + fil_node_t* node, /*!< in: file node */ + fil_system_t* system, /*!< in: tablespace memory cache */ + ulint type) /*!< in: OS_FILE_WRITE or OS_FILE_READ; marks + the node as modified if + type == OS_FILE_WRITE */ +{ + ut_ad(node); + ut_ad(system); + ut_ad(mutex_own(&(system->mutex))); + + ut_a(node->n_pending > 0); + + node->n_pending--; + + if (type == OS_FILE_WRITE) { + ut_ad(!srv_read_only_mode); + system->modification_counter++; + node->modification_counter = system->modification_counter; + + if (fil_buffering_disabled(node->space)) { + + /* We don't need to keep track of unflushed + changes as user has explicitly disabled + buffering. */ + ut_ad(!node->space->is_in_unflushed_spaces); + node->flush_counter = node->modification_counter; + + } else if (!node->space->is_in_unflushed_spaces) { + + node->space->is_in_unflushed_spaces = true; + UT_LIST_ADD_FIRST(unflushed_spaces, + system->unflushed_spaces, + node->space); + } + } + + if (node->n_pending == 0 && fil_space_belongs_in_lru(node->space)) { + + /* The node must be put back to the LRU list */ + UT_LIST_ADD_FIRST(LRU, system->LRU, node); + } +} + +/********************************************************************//** +Report information about an invalid page access. 
*/ +static +void +fil_report_invalid_page_access( +/*===========================*/ + ulint block_offset, /*!< in: block offset */ + ulint space_id, /*!< in: space id */ + const char* space_name, /*!< in: space name */ + ulint byte_offset, /*!< in: byte offset */ + ulint len, /*!< in: I/O length */ + ulint type) /*!< in: I/O type */ +{ + fprintf(stderr, + "InnoDB: Error: trying to access page number %lu" + " in space %lu,\n" + "InnoDB: space name %s,\n" + "InnoDB: which is outside the tablespace bounds.\n" + "InnoDB: Byte offset %lu, len %lu, i/o type %lu.\n" + "InnoDB: If you get this error at mysqld startup," + " please check that\n" + "InnoDB: your my.cnf matches the ibdata files" + " that you have in the\n" + "InnoDB: MySQL server.\n", + (ulong) block_offset, (ulong) space_id, space_name, + (ulong) byte_offset, (ulong) len, (ulong) type); +} + +/********************************************************************//** +Reads or writes data. This operation is asynchronous (aio). +@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do +i/o on a tablespace which does not exist */ +UNIV_INTERN +dberr_t +_fil_io( +/*===*/ + ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE, + ORed to OS_FILE_LOG, if a log i/o + and ORed to OS_AIO_SIMULATED_WAKE_LATER + if simulated aio and we want to post a + batch of i/os; NOTE that a simulated batch + may introduce hidden chances of deadlocks, + because i/os are not actually handled until + all have been posted: use with great + caution! 
*/ + bool sync, /*!< in: true if synchronous aio is desired */ + ulint space_id, /*!< in: space id */ + ulint zip_size, /*!< in: compressed page size in bytes; + 0 for uncompressed pages */ + ulint block_offset, /*!< in: offset in number of blocks */ + ulint byte_offset, /*!< in: remainder of offset in bytes; in + aio this must be divisible by the OS block + size */ + ulint len, /*!< in: how many bytes to read or write; this + must not cross a file boundary; in aio this + must be a block size multiple */ + void* buf, /*!< in/out: buffer where to store read data + or from where to write; in aio this must be + appropriately aligned */ + void* message, /*!< in: message for aio handler if non-sync + aio used, else ignored */ + ulint* write_size, /*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ + trx_t* trx) +{ + ulint mode; + fil_space_t* space; + fil_node_t* node; + ibool ret; + ulint is_log; + ulint wake_later; + os_offset_t offset; + ibool ignore_nonexistent_pages; + ibool page_compressed = FALSE; + ulint page_compression_level = 0; + ibool page_encrypted = FALSE; + ulint page_encryption_key = 0; + + + is_log = type & OS_FILE_LOG; + type = type & ~OS_FILE_LOG; + + wake_later = type & OS_AIO_SIMULATED_WAKE_LATER; + type = type & ~OS_AIO_SIMULATED_WAKE_LATER; + + ignore_nonexistent_pages = type & BUF_READ_IGNORE_NONEXISTENT_PAGES; + type &= ~BUF_READ_IGNORE_NONEXISTENT_PAGES; + + ut_ad(byte_offset < UNIV_PAGE_SIZE); + ut_ad(!zip_size || !byte_offset); + ut_ad(ut_is_2pow(zip_size)); + ut_ad(buf); + ut_ad(len > 0); + ut_ad(UNIV_PAGE_SIZE == (ulong)(1 << UNIV_PAGE_SIZE_SHIFT)); +#if (1 << UNIV_PAGE_SIZE_SHIFT_MAX) != UNIV_PAGE_SIZE_MAX +# error "(1 << UNIV_PAGE_SIZE_SHIFT_MAX) != UNIV_PAGE_SIZE_MAX" +#endif +#if (1 << UNIV_PAGE_SIZE_SHIFT_MIN) != UNIV_PAGE_SIZE_MIN +# error "(1 << UNIV_PAGE_SIZE_SHIFT_MIN) != UNIV_PAGE_SIZE_MIN" +#endif 
+ ut_ad(fil_validate_skip()); +#ifndef UNIV_HOTBACKUP +# ifndef UNIV_LOG_DEBUG + /* ibuf bitmap pages must be read in the sync aio mode: */ + ut_ad(recv_no_ibuf_operations + || type == OS_FILE_WRITE + || !ibuf_bitmap_page(zip_size, block_offset) + || sync + || is_log); +# endif /* UNIV_LOG_DEBUG */ + if (sync) { + mode = OS_AIO_SYNC; + } else if (is_log) { + mode = OS_AIO_LOG; + } else if (type == OS_FILE_READ + && !recv_no_ibuf_operations + && ibuf_page(space_id, zip_size, block_offset, NULL)) { + mode = OS_AIO_IBUF; + } else { + mode = OS_AIO_NORMAL; + } +#else /* !UNIV_HOTBACKUP */ + ut_a(sync); + mode = OS_AIO_SYNC; +#endif /* !UNIV_HOTBACKUP */ + + if (type == OS_FILE_READ) { + srv_stats.data_read.add(len); + } else if (type == OS_FILE_WRITE) { + ut_ad(!srv_read_only_mode); + srv_stats.data_written.add(len); + if (fil_page_is_index_page((byte *)buf)) { + srv_stats.index_pages_written.inc(); + } else { + srv_stats.non_index_pages_written.inc(); + } + } + + /* Reserve the fil_system mutex and make sure that we can open at + least one file while holding it, if the file is not already open */ + + fil_mutex_enter_and_prepare_for_io(space_id); + + space = fil_space_get_by_id(space_id); + + page_compressed = fsp_flags_is_page_compressed(space->flags); + page_compression_level = fsp_flags_get_page_compression_level(space->flags); + + page_encrypted = fsp_flags_is_page_encrypted(space->flags); + page_encryption_key = fsp_flags_get_page_encryption_key(space->flags); + + + /* If we are deleting a tablespace we don't allow any read + operations on that. However, we do allow write operations. */ + if (space == 0 || (type == OS_FILE_READ && space->stop_new_ops)) { + mutex_exit(&fil_system->mutex); + + ib_logf(IB_LOG_LEVEL_ERROR, + "Trying to do i/o to a tablespace which does " + "not exist. i/o type %lu, space id %lu, " + "page no. 
%lu, i/o length %lu bytes", + (ulong) type, (ulong) space_id, (ulong) block_offset, + (ulong) len); + + return(DB_TABLESPACE_DELETED); + } + + ut_ad(mode != OS_AIO_IBUF || space->purpose == FIL_TABLESPACE); + + node = UT_LIST_GET_FIRST(space->chain); + + for (;;) { + if (node == NULL) { + if (ignore_nonexistent_pages) { + mutex_exit(&fil_system->mutex); + return(DB_ERROR); + } + + fil_report_invalid_page_access( + block_offset, space_id, space->name, + byte_offset, len, type); + + ut_error; + + } else if (fil_is_user_tablespace_id(space->id) + && node->size == 0) { + + /* We do not know the size of a single-table tablespace + before we open the file */ + break; + } else if (node->size > block_offset) { + /* Found! */ + break; + } else { + block_offset -= node->size; + node = UT_LIST_GET_NEXT(chain, node); + } + } + + /* Open file if closed */ + if (!fil_node_prepare_for_io(node, fil_system, space)) { + if (space->purpose == FIL_TABLESPACE + && fil_is_user_tablespace_id(space->id)) { + mutex_exit(&fil_system->mutex); + + ib_logf(IB_LOG_LEVEL_ERROR, + "Trying to do i/o to a tablespace which " + "exists without .ibd data file. " + "i/o type %lu, space id %lu, page no %lu, " + "i/o length %lu bytes", + (ulong) type, (ulong) space_id, + (ulong) block_offset, (ulong) len); + + return(DB_TABLESPACE_DELETED); + } + + /* The tablespace is for log. Currently, we just assert here + to prevent handling errors along the way fil_io returns. + Also, if the log files are missing, it would be hard to + promise the server can continue running. */ + ut_a(0); + } + + /* Check that at least the start offset is within the bounds of a + single-table tablespace, including rollback tablespaces. 
*/ + if (UNIV_UNLIKELY(node->size <= block_offset) + && space->id != 0 && space->purpose == FIL_TABLESPACE) { + + fil_report_invalid_page_access( + block_offset, space_id, space->name, byte_offset, + len, type); + + ut_error; + } + + /* Now we have made the changes in the data structures of fil_system */ + mutex_exit(&fil_system->mutex); + + /* Calculate the low 32 bits and the high 32 bits of the file offset */ + + if (!zip_size) { + offset = ((os_offset_t) block_offset << UNIV_PAGE_SIZE_SHIFT) + + byte_offset; + + ut_a(node->size - block_offset + >= ((byte_offset + len + (UNIV_PAGE_SIZE - 1)) + / UNIV_PAGE_SIZE)); + } else { + ulint zip_size_shift; + switch (zip_size) { + case 1024: zip_size_shift = 10; break; + case 2048: zip_size_shift = 11; break; + case 4096: zip_size_shift = 12; break; + case 8192: zip_size_shift = 13; break; + case 16384: zip_size_shift = 14; break; + default: ut_error; + } + offset = ((os_offset_t) block_offset << zip_size_shift) + + byte_offset; + ut_a(node->size - block_offset + >= (len + (zip_size - 1)) / zip_size); + } + + /* Do aio */ + + ut_a(byte_offset % OS_MIN_LOG_BLOCK_SIZE == 0); + ut_a((len % OS_MIN_LOG_BLOCK_SIZE) == 0); + +#ifndef UNIV_HOTBACKUP + if (UNIV_UNLIKELY(space->is_corrupt && srv_pass_corrupt_table)) { + + /* should ignore i/o for the crashed space */ + if (srv_pass_corrupt_table == 1 || + type == OS_FILE_WRITE) { + + mutex_enter(&fil_system->mutex); + fil_node_complete_io(node, fil_system, type); + mutex_exit(&fil_system->mutex); + if (mode == OS_AIO_NORMAL) { + ut_a(space->purpose == FIL_TABLESPACE); + buf_page_io_complete(static_cast + (message)); + } + } + + if (srv_pass_corrupt_table == 1 && type == OS_FILE_READ) { + + return(DB_TABLESPACE_DELETED); + + } else if (type == OS_FILE_WRITE) { + + return(DB_SUCCESS); + } + } + + /* Queue the aio request */ + ret = os_aio(type, mode | wake_later, node->name, node->handle, buf, + offset, len, node, message, space_id, trx, page_compressed, page_compression_level, 
write_size, page_encrypted, page_encryption_key); + +#else + /* In ibbackup do normal i/o, not aio */ + if (type == OS_FILE_READ) { + ret = os_file_read(node->handle, buf, offset, len); + } else { + ut_ad(!srv_read_only_mode); + ret = os_file_write(node->name, node->handle, buf, + offset, len); + } +#endif /* !UNIV_HOTBACKUP */ + ut_a(ret); + + if (mode == OS_AIO_SYNC) { + /* The i/o operation is already completed when we return from + os_aio: */ + + mutex_enter(&fil_system->mutex); + + fil_node_complete_io(node, fil_system, type); + + mutex_exit(&fil_system->mutex); + + ut_ad(fil_validate_skip()); + } + + return(DB_SUCCESS); +} + +#ifndef UNIV_HOTBACKUP +/**********************************************************************//** +Waits for an aio operation to complete. This function is used to write the +handler for completed requests. The aio array of pending requests is divided +into segments (see os0file.cc for more info). The thread specifies which +segment it wants to wait for. */ +UNIV_INTERN +void +fil_aio_wait( +/*=========*/ + ulint segment) /*!< in: the number of the segment in the aio + array to wait for */ +{ + ibool ret; + fil_node_t* fil_node; + void* message; + ulint type; + ulint space_id = 0; + + ut_ad(fil_validate_skip()); + + if (srv_use_native_aio) { + srv_set_io_thread_op_info(segment, "native aio handle"); +#ifdef WIN_ASYNC_IO + ret = os_aio_windows_handle( + segment, 0, &fil_node, &message, &type, &space_id); +#elif defined(LINUX_NATIVE_AIO) + ret = os_aio_linux_handle( + segment, &fil_node, &message, &type, &space_id); +#else + ut_error; + ret = 0; /* Eliminate compiler warning */ +#endif /* WIN_ASYNC_IO */ + } else { + srv_set_io_thread_op_info(segment, "simulated aio handle"); + + ret = os_aio_simulated_handle( + segment, &fil_node, &message, &type, &space_id); + } + + ut_a(ret); + if (fil_node == NULL) { + ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS); + return; + } + + srv_set_io_thread_op_info(segment, "complete io for fil 
node"); + + mutex_enter(&fil_system->mutex); + + fil_node_complete_io(fil_node, fil_system, type); + + mutex_exit(&fil_system->mutex); + + ut_ad(fil_validate_skip()); + + /* Do the i/o handling */ + /* IMPORTANT: since i/o handling for reads will read also the insert + buffer in tablespace 0, you have to be very careful not to introduce + deadlocks in the i/o system. We keep tablespace 0 data files always + open, and use a special i/o thread to serve insert buffer requests. */ + + if (fil_node->space->purpose == FIL_TABLESPACE) { + srv_set_io_thread_op_info(segment, "complete io for buf page"); + buf_page_io_complete(static_cast(message)); + } else { + srv_set_io_thread_op_info(segment, "complete io for log"); + log_io_complete(static_cast(message)); + } +} +#endif /* UNIV_HOTBACKUP */ + +/**********************************************************************//** +Flushes to disk possible writes cached by the OS. If the space does not exist +or is being dropped, does not do anything. */ +UNIV_INTERN +void +fil_flush( +/*======*/ + ulint space_id) /*!< in: file space id (this can be a group of + log files or a tablespace of the database) */ +{ + fil_space_t* space; + fil_node_t* node; + os_file_t file; + + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(space_id); + + if (!space || space->stop_new_ops) { + mutex_exit(&fil_system->mutex); + + return; + } + + if (fil_buffering_disabled(space)) { + + /* No need to flush. User has explicitly disabled + buffering. 
*/ + ut_ad(!space->is_in_unflushed_spaces); + ut_ad(fil_space_is_flushed(space)); + ut_ad(space->n_pending_flushes == 0); + +#ifdef UNIV_DEBUG + for (node = UT_LIST_GET_FIRST(space->chain); + node != NULL; + node = UT_LIST_GET_NEXT(chain, node)) { + ut_ad(node->modification_counter + == node->flush_counter); + ut_ad(node->n_pending_flushes == 0); + } +#endif /* UNIV_DEBUG */ + + mutex_exit(&fil_system->mutex); + return; + } + + space->n_pending_flushes++; /*!< prevent dropping of the space while + we are flushing */ + for (node = UT_LIST_GET_FIRST(space->chain); + node != NULL; + node = UT_LIST_GET_NEXT(chain, node)) { + + ib_int64_t old_mod_counter = node->modification_counter;; + + if (old_mod_counter <= node->flush_counter) { + continue; + } + + ut_a(node->open); + + if (space->purpose == FIL_TABLESPACE) { + fil_n_pending_tablespace_flushes++; + } else { + fil_n_pending_log_flushes++; + fil_n_log_flushes++; + } +#ifdef __WIN__ + if (node->is_raw_disk) { + + goto skip_flush; + } +#endif /* __WIN__ */ +retry: + if (node->n_pending_flushes > 0) { + /* We want to avoid calling os_file_flush() on + the file twice at the same time, because we do + not know what bugs OS's may contain in file + i/o */ + + ib_int64_t sig_count = + os_event_reset(node->sync_event); + + mutex_exit(&fil_system->mutex); + + os_event_wait_low(node->sync_event, sig_count); + + mutex_enter(&fil_system->mutex); + + if (node->flush_counter >= old_mod_counter) { + + goto skip_flush; + } + + goto retry; + } + + ut_a(node->open); + file = node->handle; + node->n_pending_flushes++; + + mutex_exit(&fil_system->mutex); + + os_file_flush(file); + + mutex_enter(&fil_system->mutex); + + os_event_set(node->sync_event); + + node->n_pending_flushes--; +skip_flush: + if (node->flush_counter < old_mod_counter) { + node->flush_counter = old_mod_counter; + + if (space->is_in_unflushed_spaces + && fil_space_is_flushed(space)) { + + space->is_in_unflushed_spaces = false; + + UT_LIST_REMOVE( + unflushed_spaces, + 
fil_system->unflushed_spaces, + space); + } + } + + if (space->purpose == FIL_TABLESPACE) { + fil_n_pending_tablespace_flushes--; + } else { + fil_n_pending_log_flushes--; + } + } + + space->n_pending_flushes--; + + mutex_exit(&fil_system->mutex); +} + +/**********************************************************************//** +Flushes to disk the writes in file spaces of the given type possibly cached by +the OS. */ +UNIV_INTERN +void +fil_flush_file_spaces( +/*==================*/ + ulint purpose) /*!< in: FIL_TABLESPACE, FIL_LOG */ +{ + fil_space_t* space; + ulint* space_ids; + ulint n_space_ids; + ulint i; + + mutex_enter(&fil_system->mutex); + + n_space_ids = UT_LIST_GET_LEN(fil_system->unflushed_spaces); + if (n_space_ids == 0) { + + mutex_exit(&fil_system->mutex); + return; + } + + /* Assemble a list of space ids to flush. Previously, we + traversed fil_system->unflushed_spaces and called UT_LIST_GET_NEXT() + on a space that was just removed from the list by fil_flush(). + Thus, the space could be dropped and the memory overwritten. */ + space_ids = static_cast( + mem_alloc(n_space_ids * sizeof *space_ids)); + + n_space_ids = 0; + + for (space = UT_LIST_GET_FIRST(fil_system->unflushed_spaces); + space; + space = UT_LIST_GET_NEXT(unflushed_spaces, space)) { + + if (space->purpose == purpose && !space->stop_new_ops) { + + space_ids[n_space_ids++] = space->id; + } + } + + mutex_exit(&fil_system->mutex); + + /* Flush the spaces. It will not hurt to call fil_flush() on + a non-existing space id. */ + for (i = 0; i < n_space_ids; i++) { + + fil_flush(space_ids[i]); + } + + mem_free(space_ids); +} + +/** Functor to validate the space list. */ +struct Check { + void operator()(const fil_node_t* elem) + { + ut_a(elem->open || !elem->n_pending); + } +}; + +/******************************************************************//** +Checks the consistency of the tablespace cache. 
+@return TRUE if ok */ +UNIV_INTERN +ibool +fil_validate(void) +/*==============*/ +{ + fil_space_t* space; + fil_node_t* fil_node; + ulint n_open = 0; + ulint i; + + mutex_enter(&fil_system->mutex); + + /* Look for spaces in the hash table */ + + for (i = 0; i < hash_get_n_cells(fil_system->spaces); i++) { + + for (space = static_cast( + HASH_GET_FIRST(fil_system->spaces, i)); + space != 0; + space = static_cast( + HASH_GET_NEXT(hash, space))) { + + UT_LIST_VALIDATE( + chain, fil_node_t, space->chain, Check()); + + for (fil_node = UT_LIST_GET_FIRST(space->chain); + fil_node != 0; + fil_node = UT_LIST_GET_NEXT(chain, fil_node)) { + + if (fil_node->n_pending > 0) { + ut_a(fil_node->open); + } + + if (fil_node->open) { + n_open++; + } + } + } + } + + ut_a(fil_system->n_open == n_open); + + UT_LIST_CHECK(LRU, fil_node_t, fil_system->LRU); + + for (fil_node = UT_LIST_GET_FIRST(fil_system->LRU); + fil_node != 0; + fil_node = UT_LIST_GET_NEXT(LRU, fil_node)) { + + ut_a(fil_node->n_pending == 0); + ut_a(!fil_node->being_extended); + ut_a(fil_node->open); + ut_a(fil_space_belongs_in_lru(fil_node->space)); + } + + mutex_exit(&fil_system->mutex); + + return(TRUE); +} + +/********************************************************************//** +Returns TRUE if file address is undefined. +@return TRUE if undefined */ +UNIV_INTERN +ibool +fil_addr_is_null( +/*=============*/ + fil_addr_t addr) /*!< in: address */ +{ + return(addr.page == FIL_NULL); +} + +/********************************************************************//** +Get the predecessor of a file page. +@return FIL_PAGE_PREV */ +UNIV_INTERN +ulint +fil_page_get_prev( +/*==============*/ + const byte* page) /*!< in: file page */ +{ + return(mach_read_from_4(page + FIL_PAGE_PREV)); +} + +/********************************************************************//** +Get the successor of a file page. 
+@return FIL_PAGE_NEXT */ +UNIV_INTERN +ulint +fil_page_get_next( +/*==============*/ + const byte* page) /*!< in: file page */ +{ + return(mach_read_from_4(page + FIL_PAGE_NEXT)); +} + +/*********************************************************************//** +Sets the file page type. */ +UNIV_INTERN +void +fil_page_set_type( +/*==============*/ + byte* page, /*!< in/out: file page */ + ulint type) /*!< in: type */ +{ + ut_ad(page); + + mach_write_to_2(page + FIL_PAGE_TYPE, type); +} + +/*********************************************************************//** +Gets the file page type. +@return type; NOTE that if the type has not been written to page, the +return value not defined */ +UNIV_INTERN +ulint +fil_page_get_type( +/*==============*/ + const byte* page) /*!< in: file page */ +{ + ut_ad(page); + + return(mach_read_from_2(page + FIL_PAGE_TYPE)); +} + +/****************************************************************//** +Closes the tablespace memory cache. */ +UNIV_INTERN +void +fil_close(void) +/*===========*/ +{ +#ifndef UNIV_HOTBACKUP + /* The mutex should already have been freed. */ + ut_ad(fil_system->mutex.magic_n == 0); +#endif /* !UNIV_HOTBACKUP */ + + hash_table_free(fil_system->spaces); + + hash_table_free(fil_system->name_hash); + + ut_a(UT_LIST_GET_LEN(fil_system->LRU) == 0); + ut_a(UT_LIST_GET_LEN(fil_system->unflushed_spaces) == 0); + ut_a(UT_LIST_GET_LEN(fil_system->space_list) == 0); + + mem_free(fil_system); + + fil_system = NULL; +} + +/********************************************************************//** +Initializes a buffer control block when the buf_pool is created. */ +static +void +fil_buf_block_init( +/*===============*/ + buf_block_t* block, /*!< in: pointer to control block */ + byte* frame) /*!< in: pointer to buffer frame */ +{ + UNIV_MEM_DESC(frame, UNIV_PAGE_SIZE); + + block->frame = frame; + + block->page.io_fix = BUF_IO_NONE; + /* There are assertions that check for this. 
*/ + block->page.buf_fix_count = 1; + block->page.state = BUF_BLOCK_READY_FOR_USE; + + page_zip_des_init(&block->page.zip); +} + +struct fil_iterator_t { + os_file_t file; /*!< File handle */ + const char* filepath; /*!< File path name */ + os_offset_t start; /*!< From where to start */ + os_offset_t end; /*!< Where to stop */ + os_offset_t file_size; /*!< File size in bytes */ + ulint page_size; /*!< Page size */ + ulint n_io_buffers; /*!< Number of pages to use + for IO */ + byte* io_buffer; /*!< Buffer to use for IO */ +}; + +/********************************************************************//** +TODO: This can be made parallel trivially by chunking up the file and creating +a callback per thread. Main benefit will be to use multiple CPUs for +checksums and compressed tables. We have to do compressed tables block by +block right now. Secondly we need to decompress/compress and copy too much +data. These are CPU intensive. + +Iterate over all the pages in the tablespace. +@param iter - Tablespace iterator +@param block - block to use for IO +@param callback - Callback to inspect and update page contents +@retval DB_SUCCESS or error code */ +static +dberr_t +fil_iterate( +/*========*/ + const fil_iterator_t& iter, + buf_block_t* block, + PageCallback& callback) +{ + os_offset_t offset; + ulint page_no = 0; + ulint space_id = callback.get_space_id(); + ulint n_bytes = iter.n_io_buffers * iter.page_size; + + ut_ad(!srv_read_only_mode); + + /* TODO: For compressed tables we do a lot of useless + copying for non-index pages.
Unfortunately, it is + required by buf_zip_decompress() */ + + for (offset = iter.start; offset < iter.end; offset += n_bytes) { + + byte* io_buffer = iter.io_buffer; + + block->frame = io_buffer; + + if (callback.get_zip_size() > 0) { + page_zip_des_init(&block->page.zip); + page_zip_set_size(&block->page.zip, iter.page_size); + block->page.zip.data = block->frame + UNIV_PAGE_SIZE; + ut_d(block->page.zip.m_external = true); + ut_ad(iter.page_size == callback.get_zip_size()); + + /* Zip IO is done in the compressed page buffer. */ + io_buffer = block->page.zip.data; + } else { + io_buffer = iter.io_buffer; + } + + /* We have to read the exact number of bytes. Otherwise the + InnoDB IO functions croak on failed reads. */ + + n_bytes = static_cast( + ut_min(static_cast(n_bytes), + iter.end - offset)); + + ut_ad(n_bytes > 0); + ut_ad(!(n_bytes % iter.page_size)); + + if (!os_file_read(iter.file, io_buffer, offset, + (ulint) n_bytes, + fil_space_is_page_compressed(space_id))) { + + ib_logf(IB_LOG_LEVEL_ERROR, "os_file_read() failed"); + + return(DB_IO_ERROR); + } + + bool updated = false; + os_offset_t page_off = offset; + ulint n_pages_read = (ulint) n_bytes / iter.page_size; + + for (ulint i = 0; i < n_pages_read; ++i) { + + buf_block_set_file_page(block, space_id, page_no++); + + dberr_t err; + + if ((err = callback(page_off, block)) != DB_SUCCESS) { + + return(err); + + } else if (!updated) { + updated = buf_block_get_state(block) + == BUF_BLOCK_FILE_PAGE; + } + + buf_block_set_state(block, BUF_BLOCK_NOT_USED); + buf_block_set_state(block, BUF_BLOCK_READY_FOR_USE); + + page_off += iter.page_size; + block->frame += iter.page_size; + } + + /* A page was updated in the set, write back to disk. 
*/ + if (updated + && !os_file_write( + iter.filepath, iter.file, io_buffer, + offset, (ulint) n_bytes)) { + + ib_logf(IB_LOG_LEVEL_ERROR, "os_file_write() failed"); + + return(DB_IO_ERROR); + } + } + + return(DB_SUCCESS); +} + +/********************************************************************//** +Iterate over all the pages in the tablespace. +@param table - the table definition in the server +@param n_io_buffers - number of blocks to read and write together +@param callback - functor that will do the page updates +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +fil_tablespace_iterate( +/*===================*/ + dict_table_t* table, + ulint n_io_buffers, + PageCallback& callback) +{ + dberr_t err; + os_file_t file; + char* filepath; + + ut_a(n_io_buffers > 0); + ut_ad(!srv_read_only_mode); + + DBUG_EXECUTE_IF("ib_import_trigger_corruption_1", + return(DB_CORRUPTION);); + + if (DICT_TF_HAS_DATA_DIR(table->flags)) { + dict_get_and_save_data_dir_path(table, false); + ut_a(table->data_dir_path); + + filepath = os_file_make_remote_pathname( + table->data_dir_path, table->name, "ibd"); + } else { + filepath = fil_make_ibd_name(table->name, false); + } + + { + ibool success; + + file = os_file_create_simple_no_error_handling( + innodb_file_data_key, filepath, + OS_FILE_OPEN, OS_FILE_READ_WRITE, &success, FALSE); + + DBUG_EXECUTE_IF("fil_tablespace_iterate_failure", + { + static bool once; + + if (!once || ut_rnd_interval(0, 10) == 5) { + once = true; + success = FALSE; + os_file_close(file); + } + }); + + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(true); + + ib_logf(IB_LOG_LEVEL_ERROR, + "Trying to import a tablespace, but could not " + "open the tablespace file %s", filepath); + + mem_free(filepath); + + return(DB_TABLESPACE_NOT_FOUND); + + } else { + err = DB_SUCCESS; + } + } + + callback.set_file(filepath, file); + + os_offset_t file_size = os_file_get_size(file); + ut_a(file_size != (os_offset_t) -1); +
/* The block we will use for every physical page */ + buf_block_t block; + + memset(&block, 0x0, sizeof(block)); + + /* Allocate a page to read in the tablespace header, so that we + can determine the page size and zip_size (if it is compressed). + We allocate an extra page in case it is a compressed table. One + page is to ensure alignment. */ + + void* page_ptr = mem_alloc(3 * UNIV_PAGE_SIZE); + byte* page = static_cast(ut_align(page_ptr, UNIV_PAGE_SIZE)); + + fil_buf_block_init(&block, page); + + /* Read the first page and determine the page and zip size. */ + + if (!os_file_read(file, page, 0, UNIV_PAGE_SIZE, + dict_tf_get_page_compression(table->flags))) { + + err = DB_IO_ERROR; + + } else if ((err = callback.init(file_size, &block)) == DB_SUCCESS) { + fil_iterator_t iter; + + iter.file = file; + iter.start = 0; + iter.end = file_size; + iter.filepath = filepath; + iter.file_size = file_size; + iter.n_io_buffers = n_io_buffers; + iter.page_size = callback.get_page_size(); + + /* Compressed pages can't be optimised for block IO for now. + We do the IMPORT page by page. */ + + if (callback.get_zip_size() > 0) { + iter.n_io_buffers = 1; + ut_a(iter.page_size == callback.get_zip_size()); + } + + /** Add an extra page for compressed page scratch area. */ + + void* io_buffer = mem_alloc( + (2 + iter.n_io_buffers) * UNIV_PAGE_SIZE); + + iter.io_buffer = static_cast( + ut_align(io_buffer, UNIV_PAGE_SIZE)); + + err = fil_iterate(iter, &block, callback); + + mem_free(io_buffer); + } + + if (err == DB_SUCCESS) { + + ib_logf(IB_LOG_LEVEL_INFO, "Sync to disk"); + + if (!os_file_flush(file)) { + ib_logf(IB_LOG_LEVEL_INFO, "os_file_flush() failed!"); + err = DB_IO_ERROR; + } else { + ib_logf(IB_LOG_LEVEL_INFO, "Sync to disk - done!"); + } + } + + os_file_close(file); + + mem_free(page_ptr); + mem_free(filepath); + + return(err); +} + +/** +Set the tablespace compressed table size.
+@return DB_SUCCESS if it is valid or DB_CORRUPTION if not */ +dberr_t +PageCallback::set_zip_size(const buf_frame_t* page) UNIV_NOTHROW +{ + m_zip_size = fsp_header_get_zip_size(page); + + if (!ut_is_2pow(m_zip_size) || m_zip_size > UNIV_ZIP_SIZE_MAX) { + return(DB_CORRUPTION); + } + + return(DB_SUCCESS); +} + +/********************************************************************//** +Delete the tablespace file and any related files like .cfg. +This should not be called for temporary tables. */ +UNIV_INTERN +void +fil_delete_file( +/*============*/ + const char* ibd_name) /*!< in: filepath of the ibd + tablespace */ +{ + /* Force a delete of any stale .ibd files that are lying around. */ + + ib_logf(IB_LOG_LEVEL_INFO, "Deleting %s", ibd_name); + + os_file_delete_if_exists(innodb_file_data_key, ibd_name); + + char* cfg_name = fil_make_cfg_name(ibd_name); + + os_file_delete_if_exists(innodb_file_data_key, cfg_name); + + mem_free(cfg_name); +} + +/************************************************************************* +Return local hash table information. */ + +ulint +fil_system_hash_cells(void) +/*=======================*/ +{ + if (fil_system) { + return (fil_system->spaces->n_cells + + fil_system->name_hash->n_cells); + } else { + return 0; + } +} + +ulint +fil_system_hash_nodes(void) +/*=======================*/ +{ + if (fil_system) { + return (UT_LIST_GET_LEN(fil_system->space_list) + * (sizeof(fil_space_t) + MEM_BLOCK_HEADER_SIZE)); + } else { + return 0; + } +} + +/** +Iterate over all the spaces in the space list and fetch the +tablespace names. It will return a copy of the name that must be +freed by the caller using: delete[]. +@return DB_SUCCESS if all OK.
*/ +UNIV_INTERN +dberr_t +fil_get_space_names( +/*================*/ + space_name_list_t& space_name_list) + /*!< in/out: List to append to */ +{ + fil_space_t* space; + dberr_t err = DB_SUCCESS; + + mutex_enter(&fil_system->mutex); + + for (space = UT_LIST_GET_FIRST(fil_system->space_list); + space != NULL; + space = UT_LIST_GET_NEXT(space_list, space)) { + + if (space->purpose == FIL_TABLESPACE) { + ulint len; + char* name; + + len = strlen(space->name); + name = new(std::nothrow) char[len + 1]; + + if (name == 0) { + /* Caller to free elements allocated so far. */ + err = DB_OUT_OF_MEMORY; + break; + } + + memcpy(name, space->name, len); + name[len] = 0; + + space_name_list.push_back(name); + } + } + + mutex_exit(&fil_system->mutex); + + return(err); +} + +/****************************************************************//** +Generate redo logs for swapping two .ibd files */ +UNIV_INTERN +void +fil_mtr_rename_log( +/*===============*/ + ulint old_space_id, /*!< in: tablespace id of the old + table. 
*/ + const char* old_name, /*!< in: old table name */ + ulint new_space_id, /*!< in: tablespace id of the new + table */ + const char* new_name, /*!< in: new table name */ + const char* tmp_name, /*!< in: temp table name used while + swapping */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + if (old_space_id != TRX_SYS_SPACE) { + fil_op_write_log(MLOG_FILE_RENAME, old_space_id, + 0, 0, old_name, tmp_name, mtr); + } + + if (new_space_id != TRX_SYS_SPACE) { + fil_op_write_log(MLOG_FILE_RENAME, new_space_id, + 0, 0, new_name, old_name, mtr); + } +} + +/************************************************************************* +functions to access is_corrupt flag of fil_space_t*/ + +ibool +fil_space_is_corrupt( +/*=================*/ + ulint space_id) +{ + fil_space_t* space; + ibool ret = FALSE; + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(space_id); + + if (UNIV_UNLIKELY(space && space->is_corrupt)) { + ret = TRUE; + } + + mutex_exit(&fil_system->mutex); + + return(ret); +} + +void +fil_space_set_corrupt( +/*==================*/ + ulint space_id) +{ + fil_space_t* space; + + mutex_enter(&fil_system->mutex); + + space = fil_space_get_by_id(space_id); + + if (space) { + space->is_corrupt = TRUE; + } + + mutex_exit(&fil_system->mutex); +} + +/****************************************************************//** +Acquire fil_system mutex */ +void +fil_system_enter(void) +/*==================*/ +{ + ut_ad(!mutex_own(&fil_system->mutex)); + mutex_enter(&fil_system->mutex); +} + +/****************************************************************//** +Release fil_system mutex */ +void +fil_system_exit(void) +/*=================*/ +{ + ut_ad(mutex_own(&fil_system->mutex)); + mutex_exit(&fil_system->mutex); +} + +/*******************************************************************//** +Return space name */ +char* +fil_space_name( +/*===========*/ + fil_space_t* space) /*!< in: space */ +{ + return (space->name); +} + 
+/*******************************************************************//** +Return page type name */ +const char* +fil_get_page_type_name( +/*===================*/ + ulint page_type) /*!< in: FIL_PAGE_TYPE */ +{ + switch(page_type) { + case FIL_PAGE_PAGE_COMPRESSED: + return "PAGE_COMPRESSED"; + case FIL_PAGE_INDEX: + return "INDEX"; + case FIL_PAGE_UNDO_LOG: + return "UNDO LOG"; + case FIL_PAGE_INODE: + return "INODE"; + case FIL_PAGE_IBUF_FREE_LIST: + return "IBUF_FREE_LIST"; + case FIL_PAGE_TYPE_ALLOCATED: + return "ALLOCATED"; + case FIL_PAGE_IBUF_BITMAP: + return "IBUF_BITMAP"; + case FIL_PAGE_TYPE_SYS: + return "SYS"; + case FIL_PAGE_TYPE_TRX_SYS: + return "TRX_SYS"; + case FIL_PAGE_TYPE_FSP_HDR: + return "FSP_HDR"; + case FIL_PAGE_TYPE_XDES: + return "XDES"; + case FIL_PAGE_TYPE_BLOB: + return "BLOB"; + case FIL_PAGE_TYPE_ZBLOB: + return "ZBLOB"; + case FIL_PAGE_TYPE_ZBLOB2: + return "ZBLOB2"; + case FIL_PAGE_TYPE_COMPRESSED: + return "ORACLE PAGE COMPRESSED"; + default: + return "PAGE TYPE CORRUPTED"; + } +} diff --git a/storage/xtradb/fil/fil0pagecompress.cc b/storage/xtradb/fil/fil0pagecompress.cc index e92f35872369a..8c4b2cb13aa54 100644 --- a/storage/xtradb/fil/fil0pagecompress.cc +++ b/storage/xtradb/fil/fil0pagecompress.cc @@ -268,7 +268,8 @@ fil_compress_page( int level = 0; ulint header_len = FIL_PAGE_DATA + FIL_PAGE_COMPRESSED_SIZE; ulint write_size=0; - ulint comp_method = innodb_compression_algorithm; /* Cache to avoid + ulint comp_method = 1;//innodb_compression_algorithm; + /* Cache to avoid change during function execution */ ut_ad(buf); diff --git a/storage/xtradb/fil/fil0pageencryption.cc b/storage/xtradb/fil/fil0pageencryption.cc new file mode 100644 index 0000000000000..111fd7371df46 --- /dev/null +++ b/storage/xtradb/fil/fil0pageencryption.cc @@ -0,0 +1,638 @@ +/***************************************************************************** + +Copyright (C) 2014 eperi GmbH. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/***************************************************************** + @file fil/fil0pageencryption.cc + Implementation for page encryption file spaces. + + Created 08/25/2014 Florin Fugaciu + ***********************************************************************/ + +#include "fil0fil.h" +#include "fil0pageencryption.h" +#include "fsp0pageencryption.h" +#include "my_dbug.h" + +#include "buf0checksum.h" + +#include +#include + +//#define UNIV_PAGEENCRIPTION_DEBUG +//#define CRYPT_FF +
/** Sizes, in bytes, used by the page encryption code. */
enum fil_page_encryption_sizes {
	FIL_PAGE_ENCRYPTION_AES_BLOCK	= 16,	/*!< AES block size */
	FIL_PAGE_ENCRYPTION_IV_LEN	= 16,	/*!< CBC initialisation vector */
	FIL_PAGE_ENCRYPTION_MAX_KEY_LEN	= 32,	/*!< largest AES key (256 bit) */
	FIL_PAGE_ENCRYPTION_TAIL_LEN	= 64	/*!< output of the 2nd pass */
};

/****************************************************************//**
Calculate the checksum that is stored with an encrypted page and used
to verify decryption. The value is derived from ut_fold_binary() and is
written into a 3 byte field, but note that the masking below keeps only
bits 8..27 of the fold, i.e. 20 significant bits. This is part of the
on-disk format and must stay as it is.
@return checksum, always below 2^20 */
ulint
fil_page_encryption_calc_checksum(
/*==============================*/
	unsigned char*	buf,	/*!< in: data to checksum */
	size_t		size)	/*!< in: number of bytes in buf */
{
	return((ut_fold_binary(buf, size) & 0x0FFFFFF0UL) >> 8);
}

/****************************************************************//**
Fetch key and IV for a key id from the key store and convert them from
their hex representation to bytes.
@return AES_OK, AES_KEY_CREATION_FAILED when the id has no usable entry,
AES_BAD_KEYSIZE when the key is not 16, 24 or 32 bytes long, AES_BAD_DATA
when the IV string is too short */
static
int
fil_page_encryption_load_key(
/*=========================*/
	ulint		key_id,	/*!< in: id of the key to load */
	unsigned char*	key,	/*!< out: key bytes; must have room for
				FIL_PAGE_ENCRYPTION_MAX_KEY_LEN bytes */
	uint8*		key_len,/*!< out: number of key bytes stored */
	unsigned char*	iv)	/*!< out: FIL_PAGE_ENCRYPTION_IV_LEN bytes */
{
	KeySingleton&	keys = KeySingleton::getInstance();
	keyentry*	entry = keys.getKeys(key_id);

	if (entry == NULL || entry->key == NULL || entry->iv == NULL) {
		return(AES_KEY_CREATION_FAILED);
	}

	/* The key is stored as hex digits, two per byte. Anything that is
	not a valid AES key size is rejected before it is decoded, so the
	output buffer can never be overrun. */
	const size_t	key_bytes = strlen(entry->key) / 2;

	if (key_bytes != 16 && key_bytes != 24 && key_bytes != 32) {
		return(AES_BAD_KEYSIZE);
	}

	/* Decoding the IV consumes two hex digits per byte; a shorter
	string would be read past its end. */
	if (strlen(entry->iv) < 2 * FIL_PAGE_ENCRYPTION_IV_LEN) {
		return(AES_BAD_DATA);
	}

	*key_len = (uint8) key_bytes;
	my_aes_hexToUint(entry->key, key, *key_len);
	my_aes_hexToUint(entry->iv, iv, FIL_PAGE_ENCRYPTION_IV_LEN);

	return(AES_OK);
}

/****************************************************************//**
Report a failed page encryption and hand the unencrypted page back.
@return buf */
static
byte*
fil_encrypt_page_failed(
/*====================*/
	ulint		space_id,	/*!< in: tablespace id */
	fil_space_t*	space,		/*!< in: tablespace or NULL */
	byte*		buf,		/*!< in: original page */
	ulint		len,		/*!< in: length of buf */
	ulint*		out_len,	/*!< out: set to len */
	int		err,		/*!< in: error code to report */
	ulint		data_size)	/*!< in: cipher text length that was
					attempted */
{
	fprintf(stderr,
		"InnoDB: Warning: Encryption failed for space %lu name %s len %lu rt %d write %lu\n",
		space_id, space != NULL ? fil_space_name(space) : "(unknown)",
		len, err, data_size);
	fflush(stderr);
	srv_stats.pages_page_encryption_error.inc();
	*out_len = len;

	return(buf);
}

/****************************************************************//**
For page encrypted pages encrypt the page before actual write
operation.

Layout produced in out_buf (header = first FIL_PAGE_DATA bytes, trailer =
last FIL_PAGE_DATA_END bytes):
- 1st pass: all but the last 3 payload bytes are AES-CBC encrypted; the
  cipher text is one byte longer than its input.
- Pages that are not page compressed: the last 62 bytes of the payload
  area plus the 3rd left-over byte are encrypted again (2nd pass, 63 bytes
  in, 64 bytes out). 62 bytes go back into the payload area, the other two
  are stored in byte 3 of the header checksum field and byte 3 of the
  trailer.
- Page compressed pages: only the 1st pass is done; 2 left-over bytes
  follow the cipher text unencrypted and the 3rd is stored in byte 3 of the
  header checksum field.
- The header checksum field is reused: byte 0 = key id, bytes 1-2 =
  original page type. The page type is set to FIL_PAGE_PAGE_ENCRYPTED and
  a verification checksum is stored in the first 3 bytes of the trailer
  (or behind the compressed payload when that is longer).

NOTE(review): both passes and every page use the same IV taken from the
key entry. Changing that alters the on-disk format, so it is only flagged
here.

@return out_buf holding the encrypted page, or buf unchanged when the
key is unusable or encryption fails (*out_len is then set to len) */
byte*
fil_encrypt_page(
/*==============*/
	ulint	space_id,	/*!< in: tablespace id of the table. */
	byte*	buf,		/*!< in: buffer from which to write; in aio
				this must be appropriately aligned */
	byte*	out_buf,	/*!< out: encrypted buffer */
	ulint	len,		/*!< in: length of input buffer.*/
	ulint	encryption_key,	/*!< in: encryption key */
	ulint*	out_len,	/*!< out: actual length of encrypted page */
	ulint	mode)		/*!< in: bit 0 set: unit test mode, use the
				built-in key and skip the tablespace and
				key store lookups */
{
	/* Fixed key and IV, used in unit test mode only. */
	static const unsigned char	test_key[] = {
		0xbd, 0xe4, 0x72, 0xa2, 0x95, 0x67, 0x5c, 0xa9,
		0x2e, 0x04, 0x67, 0xea, 0xdb, 0xc0, 0xe0, 0x23};
	static const unsigned char	test_iv[] = {
		0x2d, 0x1a, 0xf8, 0xd3, 0x97, 0x4e, 0x0b, 0xd3,
		0xef, 0xed, 0x5a, 0x6f, 0x82, 0x59, 0x4f, 0x5e};

	const ulint	page_size = UNIV_PAGE_SIZE;
	const ulint	header_len = FIL_PAGE_DATA;
	const ulint	trailer_pos = page_size - FIL_PAGE_DATA_END;
	const ulint	payload_len = trailer_pos - header_len;
	/* Length of the cipher text of the 1st pass; data_size - 1 bytes
	are fed into it. */
	const ulint	data_size = (payload_len
				     / FIL_PAGE_ENCRYPTION_AES_BLOCK)
				    * FIL_PAGE_ENCRYPTION_AES_BLOCK;
	/* Payload bytes that are not covered by the 1st pass. */
	const ulint	remainder = payload_len - (data_size - 1);
	const ulint	unit_test = 0x01 & mode;

	fil_space_t*	space = NULL;
	unsigned char	key[FIL_PAGE_ENCRYPTION_MAX_KEY_LEN];
	unsigned char	iv[FIL_PAGE_ENCRYPTION_IV_LEN];
	byte		tail[FIL_PAGE_ENCRYPTION_TAIL_LEN];
	uint8		key_len = sizeof(test_key);
	const uint8	iv_len = FIL_PAGE_ENCRYPTION_IV_LEN;
	ulint		write_size = 0;
	byte		remaining_byte = 0;
	int		err = AES_OK;

	ut_ad(buf);
	ut_ad(out_buf);

	//TODO encryption default key

	if (!unit_test) {
		ut_ad(fil_space_is_page_encrypted(space_id));
		fil_system_enter();
		space = fil_space_get_by_id(space_id);
		fil_system_exit();

#ifdef UNIV_DEBUG
		fprintf(stderr,
			"InnoDB: Note: Preparing for encryption for space %lu name %s len %lu\n",
			space_id, fil_space_name(space), len);
#endif /* UNIV_DEBUG */
	}

	const ulint	orig_page_type = mach_read_from_2(buf + FIL_PAGE_TYPE);
	const ibool	page_compressed
		= (orig_page_type == FIL_PAGE_PAGE_COMPRESSED);

	/* calculate a checksum, can be used to verify decryption */
	const ulint	checksum = fil_page_encryption_calc_checksum(
		buf + header_len, payload_len);

	if (unit_test) {
		memcpy(key, test_key, sizeof(test_key));
		memcpy(iv, test_iv, sizeof(test_iv));
	} else {
		err = fil_page_encryption_load_key(
			encryption_key, key, &key_len, iv);
	}

	if (err != AES_OK) {
		return(fil_encrypt_page_failed(
			space_id, space, buf, len, out_len, err, data_size));
	}

	/* 1st encryption: data_size - 1 bytes starting from FIL_PAGE_DATA */
	write_size = data_size;
	err = my_aes_encrypt_cbc((char*) buf + header_len,
				 data_size - 1,
				 (char*) out_buf + header_len,
				 &write_size,
				 key, key_len,
				 iv, iv_len);

	if (err != AES_OK) {
		/* If error we leave the actual page as it was */
		return(fil_encrypt_page_failed(
			space_id, space, buf, len, out_len, err, data_size));
	}

	ut_ad(write_size == data_size);

	/* Copy the bytes that were not encrypted behind the cipher text.
	Page compressed pages get one byte less here; that byte is written
	to the checksum header further down. */
	memcpy(out_buf + header_len + data_size,
	       buf + header_len + data_size - 1,
	       remainder - (page_compressed ? 1 : 0));

	if (page_compressed) {
		remaining_byte = mach_read_from_1(
			buf + header_len + data_size + 1);
	} else {
		/* 2nd encryption: 63 bytes from out_buf, result length is
		64 bytes */
		err = my_aes_encrypt_cbc(
			(char*) out_buf + trailer_pos
			- (FIL_PAGE_ENCRYPTION_TAIL_LEN - 2),
			FIL_PAGE_ENCRYPTION_TAIL_LEN - 1,
			(char*) tail,
			&write_size,
			key, key_len,
			iv, iv_len);

		if (err != AES_OK) {
			return(fil_encrypt_page_failed(
				space_id, space, buf, len, out_len, err,
				data_size));
		}

		ut_ad(write_size == FIL_PAGE_ENCRYPTION_TAIL_LEN);

		/* copy 62 bytes from 2nd encryption to out_buf, last 2
		bytes are copied later to a header field */
		memcpy(out_buf + trailer_pos
		       - (FIL_PAGE_ENCRYPTION_TAIL_LEN - 2),
		       tail, FIL_PAGE_ENCRYPTION_TAIL_LEN - 2);
	}

	/* set up the trailer. */
	memcpy(out_buf + trailer_pos, buf + trailer_pos, FIL_PAGE_DATA_END);

	/* Set up the page header. Copied from input buffer */
	memcpy(out_buf, buf, FIL_PAGE_DATA);

	/* Set up the checksum. This is only usable to verify decryption.
	It normally goes to the start of the trailer. A page compressed
	page whose payload (length stored as a two byte value at
	FIL_PAGE_DATA) reaches into the trailer gets it behind the payload
	instead, and none at all if fewer than 3 bytes are left there. */
	ulint	pos_checksum = trailer_pos;

	if (page_compressed) {
		const ulint	compressed_end
			= mach_read_from_2(buf + FIL_PAGE_DATA)
			+ FIL_PAGE_DATA;

		if (compressed_end > pos_checksum) {
			pos_checksum = compressed_end;
		}
	}

	if (pos_checksum <= page_size - 3) {
		mach_write_to_3(out_buf + pos_checksum, checksum);
	}

	/* Set up the correct page type */
	mach_write_to_2(out_buf + FIL_PAGE_TYPE, FIL_PAGE_PAGE_ENCRYPTED);

	/* The checksum header field is not used as a checksum for page
	encrypted pages; it stores control data instead. */

	/* 1st byte: encryption key id */
	mach_write_to_1(out_buf + FIL_PAGE_SPACE_OR_CHKSUM, encryption_key);

	/* 2nd and 3rd byte: original page type */
	mach_write_to_2(out_buf + FIL_PAGE_SPACE_OR_CHKSUM + 1,
			orig_page_type);

	/* 4th byte, and for pages that are not compressed also the 4th
	byte of the trailer: the bytes that did not fit into the payload
	area */
	if (!page_compressed) {
		out_buf[FIL_PAGE_SPACE_OR_CHKSUM + 3]
			= tail[FIL_PAGE_ENCRYPTION_TAIL_LEN - 2];
		out_buf[trailer_pos + 3]
			= tail[FIL_PAGE_ENCRYPTION_TAIL_LEN - 1];
	} else {
		out_buf[FIL_PAGE_SPACE_OR_CHKSUM + 3] = remaining_byte;
	}

#ifdef UNIV_DEBUG
	/* Verify */
	ut_ad(fil_page_is_encrypted(out_buf));
#endif /* UNIV_DEBUG */

	srv_stats.pages_page_encrypted.inc();
	*out_len = page_size;

	return(out_buf);
}
+ +/****************************************************************//** + For page encrypted pages decrypt the page after actual read + operation. + @return decrypted page */ +ulint fil_decrypt_page( +/*================*/ +byte* page_buf, /*!< in: preallocated buffer or NULL */ +byte* buf, /*!< in/out: buffer from which to read; in aio + this must be appropriately aligned */ +ulint len, /*!< in: length of output buffer.*/ +ulint* write_size, /*!< in/out: Actual payload size of the decrypted data. */ +ibool* page_compressed, /*!(ut_malloc(len + 16)); + //in_buf = static_cast(ut_malloc(UNIV_PAGE_SIZE)); + } else { + in_buf = page_buf; + } + + data_size = ((page_size - header_len - FIL_PAGE_DATA_END) / 16) * 16; + remainder = (page_size - header_len - FIL_PAGE_DATA_END) - (data_size - 1); + +#ifdef UNIV_PAGEENCRIPTION_DEBUG + fprintf(stderr, + "InnoDB: Note: Preparing for decrypt for len %lu\n", actuulint compressed_size = mach_read_from_2(buf+ FIL_PAGE_DATA); + if (!page_compressed) { + /* Set up the checksum.
This is only usable to verify decryption */ + mach_write_to_3(out_buf + page_size - FIL_PAGE_DATA_END, checksum); + } else { + ulint pos_checksum = page_size - FIL_PAGE_DATA_END; + if (compressed_size + FIL_PAGE_DATA > pos_checksum) { + pos_checksum = compressed_size + FIL_PAGE_DATA; + if (pos_checksum > page_size - 3) { + // checksum not supported, because no space available + } else { + /* Set up the checksum. This is only usable to verify decryption */ + mach_write_to_3(out_buf + pos_checksum, checksum); + } + } + } +al_size); + fflush(stderr); +#endif /* UNIV_PAGEENCRIPTION_DEBUG */ + + tmp_buf= static_cast(ut_malloc(64)); + tmp_page_buf = static_cast(ut_malloc(page_size)); + memset(tmp_page_buf,0, page_size); + + const unsigned char rkey[] = {0xbd, 0xe4, 0x72, 0xa2, 0x95, 0x67, 0x5c, 0xa9, + 0x2e, 0x04, 0x67, 0xea, 0xdb, 0xc0,0xe0, 0x23}; + uint8 key_len = 16; + if (!unit_test) { + KeySingleton& keys = KeySingleton::getInstance(); + char* keyString = keys.getKeys(page_encryption_key)->key; + key_len = strlen(keyString)/2; + my_aes_hexToUint(keyString, (unsigned char*)&rkey, key_len); + } + + + const unsigned char iv[] = {0x2d, 0x1a, 0xf8, 0xd3, 0x97, 0x4e, 0x0b, 0xd3, 0xef, 0xed, + 0x5a, 0x6f, 0x82, 0x59, 0x4f,0x5e}; + if (!unit_test) { + KeySingleton& keys = KeySingleton::getInstance(); + my_aes_hexToUint(keys.getKeys(page_encryption_key)->iv, (unsigned char*)&iv, 16); + } + uint8 iv_len = 16; + + if (!page_compression_flag) { + tmp_page_buf = static_cast(ut_malloc(page_size)); + tmp_buf= static_cast(ut_malloc(64)); + memset(tmp_page_buf,0, page_size); + + + /* 1st decryption: 64 bytes */ + /* 62 bytes from data area and 2 bytes from header are copied to temporary buffer */ + memcpy(tmp_buf, buf + page_size - FIL_PAGE_DATA_END -62, 62); + memcpy(tmp_buf + 62, buf + FIL_PAGE_SPACE_OR_CHKSUM + 3, 1); + memcpy(tmp_buf + 63, buf + page_size - FIL_PAGE_DATA_END +3, 1); + + err = my_aes_decrypt_cbc((const char*) tmp_buf, + 64, + (char *) tmp_page_buf + page_size 
-FIL_PAGE_DATA_END -62, + &tmp_write_size, + (const unsigned char *)&rkey, + key_len, + (const unsigned char *)&iv, + iv_len + ); + + + /* If decrypt fails it means that page is corrupted or has an unknown key */ + if (err != AES_OK) { + fprintf(stderr, "InnoDB: Corruption: Page is marked as encrypted\n" + "InnoDB: but decrypt failed with error %d.\n" + "InnoDB: size %lu len %lu, key%d\n", err, data_size, + len, page_encryption_key); + fflush(stderr); + if (NULL == page_buf) { + ut_free(in_buf); + } + return err; + } + + ut_ad(tmp_write_size == 63); + + /* copy 1st part from buf to tmp_page_buf */ + /* do not override result of 1st decryption */ + memcpy(tmp_page_buf + FIL_PAGE_DATA, buf + FIL_PAGE_DATA, data_size - 60); + memset(in_buf, 0, page_size); + + + + } else { + tmp_page_buf = buf; + } + + err = my_aes_decrypt_cbc((char*) tmp_page_buf + FIL_PAGE_DATA, + data_size, + (char *) in_buf + FIL_PAGE_DATA, + &tmp_write_size, + (const unsigned char *)&rkey, + key_len, + (const unsigned char *)&iv, + iv_len + ); + ut_ad(tmp_write_size = data_size-1); + + memcpy(in_buf + FIL_PAGE_DATA + data_size -1, tmp_page_buf + page_size - FIL_PAGE_DATA_END - 2, remainder - offset); + if (page_compression_flag) { + /* the last byte was stored in position 4 of the checksum header */ + memcpy(in_buf + FIL_PAGE_DATA + data_size -1 + 2, tmp_page_buf+ FIL_PAGE_SPACE_OR_CHKSUM + 3, 1); + } + + + /* calculate a checksum to verify decryption*/ + checksum = fil_page_encryption_calc_checksum(in_buf + header_len, page_size - (FIL_PAGE_DATA_END + header_len) ); + /* compare with stored checksum */ + ulint compressed_size = mach_read_from_2(in_buf+ FIL_PAGE_DATA); + ibool no_checksum_support = 0; + ulint pos_checksum; + if (!page_compression_flag) { + /* Read the checksum. 
This is only usable to verify decryption */ + stored_checksum = mach_read_from_3(buf + page_size - FIL_PAGE_DATA_END); + } else { + pos_checksum = page_size - FIL_PAGE_DATA_END; + if (compressed_size + FIL_PAGE_DATA > pos_checksum) { + pos_checksum = compressed_size + FIL_PAGE_DATA; + if (pos_checksum > page_size - 3) { + // checksum not supported, because no space available + no_checksum_support = 1; + } else { + /* Read the checksum. This is only usable to verify decryption */ + stored_checksum = mach_read_from_3(buf + pos_checksum); + } + } else { + /* Read the checksum. This is only usable to verify decryption */ + stored_checksum = mach_read_from_3(buf + page_size - FIL_PAGE_DATA_END); + } + } + + if (!page_compression_flag) { + ut_free(tmp_page_buf); + ut_free(tmp_buf); + } + if (no_checksum_support) { + fprintf(stderr, "InnoDB: decrypting page can not be verified!\n"); + fflush(stderr); + + } else { + if ((stored_checksum != checksum)) { + err = PAGE_ENCRYPTION_WRONG_KEY; + // Need to free temporal buffer if no buffer was given + if (NULL == page_buf) { + ut_free(in_buf); + } + return err; + } + } + + +#ifdef UNIV_PAGEENCRIPTION_DEBUG + fprintf(stderr, "InnoDB: Note: Decryption succeeded for len %lu\n", len); + fflush(stderr); +#endif /* UNIV_PAGEENCRIPTIONulint page_compressed = 0; + _DEBUG */ + + /* copy header */ + memcpy(in_buf, buf, FIL_PAGE_DATA); + + /* copy trailer */ + memcpy(in_buf + page_size - FIL_PAGE_DATA_END, + buf + page_size - FIL_PAGE_DATA_END, FIL_PAGE_DATA_END); + + /* Copy the decrypted page to the buffer pool*/ + memcpy(buf, in_buf, page_size); + + /* setting original page type */ + + mach_write_to_2(buf + FIL_PAGE_TYPE, orig_page_type); + + + /* calc check sums and write to the buffer, if page was not compressed */ + if (!(page_compression_flag )) { + do_check_sum(page_size, buf); + } else { + /* page_compression uses BUF_NO_CHECKSUM_MAGIC as checksum */ + mach_write_to_4(buf + FIL_PAGE_SPACE_OR_CHKSUM, BUF_NO_CHECKSUM_MAGIC); + if 
(!no_checksum_support) { + /* reset the checksum bytes - if used */ + memset(buf + pos_checksum, 0, 3); + } + + } + + + flags = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + buf); + + if (!page_compression_flag) { + + page_size = fsp_flags_get_page_size(flags); + + page_num = mach_read_from_4(buf+ 4); + page_encryption_key = FSP_FLAGS_GET_PAGE_ENCRYPTION_KEY(flags); + page_encrypted = FSP_FLAGS_GET_PAGE_ENCRYPTION(flags); + page_compression_flag = FSP_FLAGS_GET_PAGE_COMPRESSION(flags); +// fprintf(stderr,"Page num, page size, key, enc, compr: %lu, %lu, %lu, %lu %lu\n", page_num, page_size, page_encryption_key, page_encrypted, page_compression_flag); + } + // Need to free temporal buffer if no buffer was given + if (NULL == page_buf) { + ut_free(in_buf); + } + + srv_stats.pages_page_decrypted.inc(); + return err; +} + + +void do_check_sum( ulint page_size, byte* buf) { + ib_uint32_t checksum; + /* recalculate check sum - from buf0flu.cc*/ + switch ((srv_checksum_algorithm_t) srv_checksum_algorithm) { + case SRV_CHECKSUM_ALGORITHM_CRC32: + case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32: + checksum = buf_calc_page_crc32(buf); + break; + case SRV_CHECKSUM_ALGORITHM_INNODB: + case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB: + checksum = (ib_uint32_t) buf_calc_page_new_checksum(buf); + break; + case SRV_CHECKSUM_ALGORITHM_NONE: + case SRV_CHECKSUM_ALGORITHM_STRICT_NONE: + + checksum = BUF_NO_CHECKSUM_MAGIC; + break; + /* no default so the compiler will emit a warning if new enum + is added and not handled here */ + } + mach_write_to_4(buf + FIL_PAGE_SPACE_OR_CHKSUM, checksum); + /* We overwrite the first 4 bytes of the end lsn field to store + the old formula checksum. Since it depends also on the field + FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the + new formula checksum. 
*/ + if (srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB + || srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_INNODB) { + checksum = (ib_uint32_t) (buf_calc_page_old_checksum(buf)); + /* In other cases we use the value assigned from above. + If CRC32 is used then it is faster to use that checksum + (calculated above) instead of calculating another one. + We can afford to store something other than + buf_calc_page_old_checksum() or BUF_NO_CHECKSUM_MAGIC in + this field because the file will not be readable by old + versions of MySQL/InnoDB anyway (older than MySQL 5.6.3) */ + } + mach_write_to_4(buf + page_size - FIL_PAGE_END_LSN_OLD_CHKSUM, checksum); +} diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc index 1355c8a22a1a7..fa3026c33ccf6 100644 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@ -209,6 +209,12 @@ static char* innobase_disable_monitor_counter = NULL; static char* innobase_reset_monitor_counter = NULL; static char* innobase_reset_all_monitor_counter = NULL; +/* Encryption for tables and columns */ +static char* innobase_data_encryption_providername = NULL; +static char* innobase_data_encryption_providerurl = NULL; +static uint innobase_data_encryption_providertype = 0; // 1 == file, 2 == server +static char* innobase_data_encryption_filekey = NULL; + /* The highest file format being used in the database. The value can be set by user, however, it will be adjusted to the newer file format if a table of such format is created/opened. 
*/ @@ -616,6 +622,12 @@ ha_create_table_option innodb_table_option_list[]= HA_TOPTION_NUMBER("PAGE_COMPRESSION_LEVEL", page_compression_level, ULINT_UNDEFINED, 0, 9, 1), /* With this option user can enable atomic writes feature for this table */ HA_TOPTION_ENUM("ATOMIC_WRITES", atomic_writes, "DEFAULT,ON,OFF", 0), + /* With this option the user can enable page encryption for the table */ + HA_TOPTION_BOOL("PAGE_ENCRYPTION", page_encryption, 0), + + /* With this option the user defines the key identifier using for the encryption */ + HA_TOPTION_NUMBER("PAGE_ENCRYPTION_KEY", page_encryption_key, ULINT_UNDEFINED, 1, 255, 1), + HA_TOPTION_END }; @@ -972,6 +984,14 @@ static SHOW_VAR innodb_status_variables[]= { (char*) &export_vars.innodb_page_compressed_trim_op_saved, SHOW_LONGLONG}, {"num_pages_page_decompressed", (char*) &export_vars.innodb_pages_page_decompressed, SHOW_LONGLONG}, + {"num_pages_page_compression_error", + (char*) &export_vars.innodb_pages_page_compression_error, SHOW_LONGLONG}, + {"num_pages_page_encrypted", + (char*) &export_vars.innodb_pages_page_encrypted, SHOW_LONGLONG}, + {"num_pages_page_decrypted", + (char*) &export_vars.innodb_pages_page_decrypted, SHOW_LONGLONG}, + {"num_pages_page_encryption_error", + (char*) &export_vars.innodb_pages_page_encryption_error, SHOW_LONGLONG}, {"have_lz4", (char*) &innodb_have_lz4, SHOW_BOOL}, {"have_lzo", @@ -3395,6 +3415,13 @@ innobase_init( ut_a(DATA_MYSQL_TRUE_VARCHAR == (ulint)MYSQL_TYPE_VARCHAR); + //FF + KeySingleton& keysingleton = KeySingleton::getInstance( + innobase_data_encryption_providername, innobase_data_encryption_providerurl, + innobase_data_encryption_providertype, innobase_data_encryption_filekey); + struct keyentry *entry = keysingleton.getKeys(1); + if(entry) printf("id:%3u \tiv:%s \tkey:%s\n", entry->id, entry->iv, entry->key); + #ifndef DBUG_OFF static const char test_filename[] = "-@"; char test_tablename[sizeof test_filename @@ -3545,6 +3572,14 @@ innobase_init( srv_data_home = 
(innobase_data_home_dir ? innobase_data_home_dir : default_path); + printf("\ninnobase_data_home_dir = %s,\n innobase_data_encryption_providerurl = %s," + "\n Type = %u, innobase_data_encryption_providername = %s\n\n", + innobase_data_home_dir, innobase_data_encryption_providerurl + ? innobase_data_encryption_providerurl : "ist NULL", + innobase_data_encryption_providertype, innobase_data_encryption_providername + ? innobase_data_encryption_providername : "ist NULL"); +// fflush(stdout); + /* Set default InnoDB data file size to 12 MB and let it be auto-extending. Thus users can use InnoDB in >= 4.0 without having to specify any startup options. */ @@ -11365,6 +11400,8 @@ innobase_table_flags( modified by another thread while the table is being created. */ const ulint default_compression_level = page_zip_level; + const ulint default_encryption_key = 1; + *flags = 0; *flags2 = 0; @@ -11563,9 +11600,11 @@ innobase_table_flags( options->page_compressed, (ulint)options->page_compression_level == ULINT_UNDEFINED ? default_compression_level : options->page_compression_level, - options->atomic_writes); - - if (create_info->options & HA_LEX_CREATE_TMP_TABLE) { + options->atomic_writes, + options->page_encryption, + (ulint)options->page_encryption_key == ULINT_UNDEFINED ? 
+ default_encryption_key : options->page_encryption_key); + if (create_info->options & HA_LEX_CREATE_TMP_TABLE) { *flags2 |= DICT_TF2_TEMPORARY; } @@ -11599,6 +11638,24 @@ ha_innobase::check_table_options( enum row_type row_format = table->s->row_type;; ha_table_option_struct *options= table->s->option_struct; atomic_writes_t awrites = (atomic_writes_t)options->atomic_writes; + if (options->page_encryption) { + if (!use_tablespace) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_ENCRYPTION requires" + " innodb_file_per_table."); + return "PAGE_ENCRYPTION"; + } + if (create_info->key_block_size) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_ENCRYPTION table can't have" + " key_block_size"); + return "PAGE_ENCRYPTION"; + } + } /* Check page compression requirements */ if (options->page_compressed) { @@ -11663,6 +11720,28 @@ ha_innobase::check_table_options( } } + if ((ulint)options->page_encryption_key != ULINT_UNDEFINED) { + if (options->page_encryption == false) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_ENCRYPTION_KEY requires" + " PAGE_ENCRYPTION"); + return "PAGE_ENCRYPTION_KEY"; + } + + if (options->page_encryption_key < 1 || options->page_encryption_key > 255) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: invalid PAGE_ENCRYPTION_KEY = %lu." + " Valid values are [1..255]", + options->page_encryption_key); + return "PAGE_ENCRYPTION_KEY"; + } + } + + /* Check atomic writes requirements */ if (awrites == ATOMIC_WRITES_ON || (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes)) { @@ -19897,6 +19976,27 @@ static MYSQL_SYSVAR_BOOL(use_mtflush, srv_use_mtflush, "Use multi-threaded flush. 
Default FALSE.", NULL, NULL, FALSE); +static MYSQL_SYSVAR_UINT(data_encryption_providertype, innobase_data_encryption_providertype, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Use table or column encryption / decryption. Default is 0 for no use, 1 for keyfile and 2 for keyserver.", + NULL, NULL, 1, 0, 2, 0); + +static MYSQL_SYSVAR_STR(data_encryption_providername, innobase_data_encryption_providername, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Name of keyfile or keyserver.", + NULL, NULL, NULL); + +static MYSQL_SYSVAR_STR(data_encryption_providerurl, innobase_data_encryption_providerurl, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Path or URL for keyfile or keyserver.", + NULL, NULL, NULL); + + static MYSQL_SYSVAR_STR(data_encryption_filekey, innobase_data_encryption_filekey, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Key to encrypt / decrypt the keyfile.", + NULL, NULL, NULL); + + static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(log_block_size), MYSQL_SYSVAR(additional_mem_pool_size), @@ -20106,7 +20206,10 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(compression_algorithm), MYSQL_SYSVAR(mtflush_threads), MYSQL_SYSVAR(use_mtflush), - + MYSQL_SYSVAR(data_encryption_providertype), + MYSQL_SYSVAR(data_encryption_providername), + MYSQL_SYSVAR(data_encryption_providerurl), + MYSQL_SYSVAR(data_encryption_filekey), NULL }; @@ -20116,7 +20219,7 @@ maria_declare_plugin(xtradb) &innobase_storage_engine, innobase_hton_name, plugin_author, - "Percona-XtraDB, Supports transactions, row-level locking, and foreign keys", + "Percona-XtraDB, Supports transactions, row-level locking, foreign keys and encryption for tables and columns", PLUGIN_LICENSE_GPL, innobase_init, /* Plugin Init */ NULL, /* Plugin Deinit */ diff --git a/storage/xtradb/handler/ha_innodb.h b/storage/xtradb/handler/ha_innodb.h index 0c76c286030dc..bf8f09e010dff 100644 --- a/storage/xtradb/handler/ha_innodb.h +++ 
b/storage/xtradb/handler/ha_innodb.h @@ -26,6 +26,9 @@ this program; if not, write to the Free Software Foundation, Inc., #include "dict0stats.h" +#include "KeySingleton.h" + + /* Structure defines translation table between mysql index and innodb index structures */ struct innodb_idx_translate_t { @@ -58,7 +61,7 @@ typedef struct st_innobase_share { /** Prebuilt structures in an InnoDB table handle used within MySQL */ struct row_prebuilt_t; -/** Engine specific table options are definined using this struct */ +/** Engine specific table options are defined using this struct */ struct ha_table_option_struct { bool page_compressed; /*!< Table is using page compression @@ -71,6 +74,8 @@ struct ha_table_option_struct srv_use_atomic_writes=1. Atomic writes are not used if value OFF.*/ + bool page_encryption; /*!< Flag for an encrypted table */ + int page_encryption_key; /*!< ID of the encryption key */ }; /** The class defining a handle to an Innodb table */ diff --git a/storage/xtradb/include/EncKeys.h b/storage/xtradb/include/EncKeys.h new file mode 100644 index 0000000000000..f1f1bf2d04ed1 --- /dev/null +++ b/storage/xtradb/include/EncKeys.h @@ -0,0 +1,80 @@ +/* Copyright (C) 2014 eperi GmbH. All Rights Reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + +/******************************************************************//** +@file EncKeys.h +A structure and class to keep keys for encryption/decryption. + +Created 09/15/2014 Florin Fugaciu +***********************************************************************/ + +#ifndef ENCKEYS_H_ +#define ENCKEYS_H_ + +#include +#include + + + + +struct keyentry { + uint id; + char *iv; + char *key; +}; + + +class EncKeys +{ +private: + static const char *strMAGIC, *newLine; + static const int magicSize; + + enum constants { MAX_OFFSETS_IN_PCRE_PATTERNS = 30 }; + enum keyAttributes { KEY_MIN = 1, KEY_MAX = 255, MAX_KEYS = 255, + MAX_IVLEN = 256, MAX_KEYLEN = 512, ivSize16 = 16, keySize32 = 32 }; + enum keyInitType { KEYINITTYPE_FILE = 1, KEYINITTYPE_SERVER = 2 }; + enum errorAttributes { MAX_KEY_LINE_SIZE = 3 * MAX_KEYLEN, MAX_KEY_FILE_SIZE = 1048576 }; + enum errorCodesLine { NO_ERROR_PARSE_OK = 0, NO_ERROR_ISCOMMENT = 10, NO_ERROR_KEY_GREATER_THAN_ASKED = 20, + ERROR_NOINITIALIZEDKEY = 30, ERROR_ID_TOO_BIG = 40, ERROR_WRONG_NUMBER_OF_MATCHES = 50, + ERROR_EQUAL_DOUBLE_KEY = 60, ERROR_UNEQUAL_DOUBLE_KEY = 70 }; + + static const char *errorNoKeyId, *errorInMatches, *errorExceedKeyFileSize, + *errorExceedKeySize, *errorEqualDoubleKey, *errorUnequalDoubleKey, + *errorNoInitializedKey, *errorFalseFileKey, + *errorNotImplemented, *errorOpenFile, *errorReadingFile, *errorFileSize; + + uint countKeys, keyLineInKeyFile; + keyentry keys[MAX_KEYS], *oneKey; + + void printKeyEntry( uint id); + int initKeysThroughFile( const char *name, const char *path, const char *filekey); + bool isComment( const char *line); + char * decryptFile( const char* filename, const char *secret, int *errorCode); + int parseFile( const char* filename, const uint maxKeyId, const char *secret); + int 
parseLine( const char *line, const uint maxKeyId); + +public: + enum errorCodesFile { NO_ERROR_KEY_FILE_PARSE_OK = 0, ERROR_KEY_FILE_PARSE_NULL = 110, + ERROR_KEY_FILE_TOO_BIG = 120, ERROR_KEY_FILE_EXCEEDS_MAX_NUMBERS_OF_KEYS = 130, + ERROR_OPEN_FILE = 140, ERROR_READING_FILE = 150, ERROR_FALSE_FILE_KEY = 160, + ERROR_KEYINITTYPE_SERVER_NOT_IMPLEMENTED = 170 }; + EncKeys(); + virtual ~EncKeys(); + bool initKeys( const char *name, const char *url, const int initType, const char *filekey); + keyentry *getKeys( int id); +}; + +#endif /* ENCKEYS_H_ */ diff --git a/storage/xtradb/include/KeySingleton.h b/storage/xtradb/include/KeySingleton.h new file mode 100644 index 0000000000000..9fbd95e04aed1 --- /dev/null +++ b/storage/xtradb/include/KeySingleton.h @@ -0,0 +1,55 @@ +/* Copyright (C) 2014 eperi GmbH. All Rights Reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + +/******************************************************************//** +@file KeySingletonPattern.h +Implementation of single pattern to keep keys for encrypting/decrypting pages. 
+ +Created 09/13/2014 Florin Fugaciu +***********************************************************************/ + + +#ifndef KEYSINGLETON_H_ +#define KEYSINGLETON_H_ + +#include "EncKeys.h" + + +class KeySingleton +{ +private: + static bool instanceInited; + static KeySingleton theInstance; + static EncKeys encKeys; + + // No new instance or object possible + KeySingleton() {} + + // No new instance possible through copy constructor + KeySingleton( const KeySingleton&) {} + + // No new instance possible through copy + KeySingleton & operator = (const KeySingleton&); + +public: + virtual ~KeySingleton() {} + static KeySingleton& getInstance(); + // Init the instance for only one time + static KeySingleton& getInstance(const char *name, const char *url, + const int initType, const char *filekey); + keyentry *getKeys(int id); +}; + +#endif /* KEYSINGLETON_H_ */ diff --git a/storage/xtradb/include/dict0dict.h b/storage/xtradb/include/dict0dict.h index 52ac5eee86b27..84d6aa4fb9b94 100644 --- a/storage/xtradb/include/dict0dict.h +++ b/storage/xtradb/include/dict0dict.h @@ -915,8 +915,10 @@ dict_tf_set( pages */ ulint page_compression_level, /*!< in: table page compression level */ - ulint atomic_writes) /*!< in: table atomic + ulint atomic_writes, /*!< in: table atomic writes option value*/ + bool page_encrypted,/*!< in: table uses page encryption */ + ulint page_encryption_key) /*!< in: page encryption key */ __attribute__((nonnull)); /********************************************************************//** Convert a 32 bit integer table flags to the 32 bit integer that is diff --git a/storage/xtradb/include/dict0dict.ic b/storage/xtradb/include/dict0dict.ic index 2b698dd721848..6a54cf5719063 100644 --- a/storage/xtradb/include/dict0dict.ic +++ b/storage/xtradb/include/dict0dict.ic @@ -543,6 +543,10 @@ dict_tf_is_valid( ulint data_dir = DICT_TF_HAS_DATA_DIR(flags); ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(flags); + ulint page_encryption = 
DICT_TF_GET_PAGE_ENCRYPTION(flags); + ulint page_encryption_key = DICT_TF_GET_PAGE_ENCRYPTION_KEY(flags); + + /* Make sure there are no bits that we do not know about. */ if (unused != 0) { @@ -554,9 +558,11 @@ dict_tf_is_valid( "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" "InnoDB: page_compression %ld page_compression_level %ld\n" "InnoDB: atomic_writes %ld\n", + "InnoDB: page_encryption %ld page_encryption_key %ld\n", unused, compact, atomic_blobs, unused, data_dir, zip_ssize, - page_compression, page_compression_level, atomic_writes + page_compression, page_compression_level, atomic_writes, + page_encryption, page_encryption_key ); return(false); @@ -693,7 +699,10 @@ dict_sys_tables_type_validate( ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(type); ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(type); - ut_a(atomic_writes <= ATOMIC_WRITES_OFF); + ulint page_encryption = DICT_TF_GET_PAGE_ENCRYPTION(type); + ulint page_encryption_key = DICT_TF_GET_PAGE_ENCRYPTION_KEY(type); + + ut_a(atomic_writes >= 0 && atomic_writes <= ATOMIC_WRITES_OFF); /* The low order bit of SYS_TABLES.TYPE is always set to 1. 
If the format is UNIV_FORMAT_B or higher, this field is the same @@ -856,7 +865,9 @@ dict_tf_set( pages */ ulint page_compression_level, /*!< in: table page compression level */ - ulint atomic_writes) /*!< in: table atomic writes setup */ + ulint atomic_writes, /*!< in: table atomic writes setup */ + bool page_encrypted, /*!< in: table uses page encryption */ + ulint page_encryption_key /*!< in: page encryption key */) { atomic_writes_t awrites = (atomic_writes_t)atomic_writes; @@ -897,6 +908,11 @@ dict_tf_set( *flags |= (atomic_writes << DICT_TF_POS_ATOMIC_WRITES); ut_a(dict_tf_get_atomic_writes(*flags) == awrites); + + if (page_encrypted) { + *flags |= (1 << DICT_TF_POS_PAGE_ENCRYPTION) + | (page_encryption_key << DICT_TF_POS_PAGE_ENCRYPTION_KEY); + } } /********************************************************************//** @@ -919,6 +935,10 @@ dict_tf_to_fsp_flags( ulint fsp_flags; ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(table_flags); ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(table_flags); + + ulint page_encryption = DICT_TF_GET_PAGE_ENCRYPTION(table_flags); + ulint page_encryption_key = DICT_TF_GET_PAGE_ENCRYPTION_KEY(table_flags); + ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(table_flags); DBUG_EXECUTE_IF("dict_tf_to_fsp_flags_failure", @@ -946,6 +966,14 @@ dict_tf_to_fsp_flags( if page compression is used for this table. */ fsp_flags |= FSP_FLAGS_SET_PAGE_COMPRESSION_LEVEL(fsp_flags, page_compression_level); + /* In addition, tablespace flags also contain if the page + encryption is used for this table. */ + fsp_flags |= FSP_FLAGS_SET_PAGE_ENCRYPTION(fsp_flags, page_encryption); + + /* In addition, tablespace flags also contain page encryption key if the page + encryption is used for this table. 
*/ + fsp_flags |= FSP_FLAGS_SET_PAGE_ENCRYPTION_KEY(fsp_flags, page_encryption_key); + /* In addition, tablespace flags also contain flag if atomic writes is used for this table */ fsp_flags |= FSP_FLAGS_SET_ATOMIC_WRITES(fsp_flags, atomic_writes); @@ -987,6 +1015,9 @@ dict_sys_tables_type_to_tf( | DICT_TF_MASK_PAGE_COMPRESSION | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL | DICT_TF_MASK_ATOMIC_WRITES + | DICT_TF_MASK_PAGE_ENCRYPTION + | DICT_TF_MASK_PAGE_ENCRYPTION_KEY + ); return(flags); @@ -1022,7 +1053,9 @@ dict_tf_to_sys_tables_type( | DICT_TF_MASK_DATA_DIR | DICT_TF_MASK_PAGE_COMPRESSION | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL - | DICT_TF_MASK_ATOMIC_WRITES); + | DICT_TF_MASK_ATOMIC_WRITES + | DICT_TF_MASK_PAGE_ENCRYPTION + | DICT_TF_MASK_PAGE_ENCRYPTION_KEY); return(type); } diff --git a/storage/xtradb/include/dict0mem.h b/storage/xtradb/include/dict0mem.h index 8de9206cb81c8..e32012dd7dc60 100644 --- a/storage/xtradb/include/dict0mem.h +++ b/storage/xtradb/include/dict0mem.h @@ -132,6 +132,12 @@ Width of the page compression flag #define DICT_TF_WIDTH_PAGE_COMPRESSION 1 #define DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL 4 +/** +Width of the page encryption flag +*/ +#define DICT_TF_WIDTH_PAGE_ENCRYPTION 1 +#define DICT_TF_WIDTH_PAGE_ENCRYPTION_KEY 8 + /** Width of atomic writes flag DEFAULT=0, ON = 1, OFF = 2 @@ -145,7 +151,7 @@ DEFAULT=0, ON = 1, OFF = 2 + DICT_TF_WIDTH_DATA_DIR \ + DICT_TF_WIDTH_PAGE_COMPRESSION \ + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL \ - + DICT_TF_WIDTH_ATOMIC_WRITES) + + DICT_TF_WIDTH_ATOMIC_WRITES + DICT_TF_WIDTH_PAGE_ENCRYPTION + DICT_TF_WIDTH_PAGE_ENCRYPTION_KEY) /** A mask of all the known/used bits in table flags */ #define DICT_TF_BIT_MASK (~(~0 << DICT_TF_BITS)) @@ -170,9 +176,16 @@ DEFAULT=0, ON = 1, OFF = 2 /** Zero relative shift position of the ATOMIC_WRITES field */ #define DICT_TF_POS_ATOMIC_WRITES (DICT_TF_POS_PAGE_COMPRESSION_LEVEL \ + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL) + +/** Zero relative shift position of the PAGE_ENCRYPTION field 
*/ +#define DICT_TF_POS_PAGE_ENCRYPTION (DICT_TF_POS_ATOMIC_WRITES \ + + DICT_TF_WIDTH_ATOMIC_WRITES) +/** Zero relative shift position of the PAGE_ENCRYPTION_KEY field */ +#define DICT_TF_POS_PAGE_ENCRYPTION_KEY (DICT_TF_POS_PAGE_ENCRYPTION \ + + DICT_TF_WIDTH_PAGE_ENCRYPTION) /** Zero relative shift position of the start of the UNUSED bits */ -#define DICT_TF_POS_UNUSED (DICT_TF_POS_ATOMIC_WRITES \ - + DICT_TF_WIDTH_ATOMIC_WRITES) +#define DICT_TF_POS_UNUSED (DICT_TF_POS_PAGE_ENCRYPTION_KEY \ + + DICT_TF_WIDTH_PAGE_ENCRYPTION_KEY) /** Bit mask of the COMPACT field */ #define DICT_TF_MASK_COMPACT \ @@ -202,6 +215,14 @@ DEFAULT=0, ON = 1, OFF = 2 #define DICT_TF_MASK_ATOMIC_WRITES \ ((~(~0 << DICT_TF_WIDTH_ATOMIC_WRITES)) \ << DICT_TF_POS_ATOMIC_WRITES) +/** Bit mask of the PAGE_ENCRYPTION field */ +#define DICT_TF_MASK_PAGE_ENCRYPTION \ + ((~(~0 << DICT_TF_WIDTH_PAGE_ENCRYPTION)) \ + << DICT_TF_POS_PAGE_ENCRYPTION) +/** Bit mask of the PAGE_ENCRYPTION_KEY field */ +#define DICT_TF_MASK_PAGE_ENCRYPTION_KEY \ + ((~(~0 << DICT_TF_WIDTH_PAGE_ENCRYPTION_KEY)) \ + << DICT_TF_POS_PAGE_ENCRYPTION_KEY) /** Return the value of the COMPACT field */ #define DICT_TF_GET_COMPACT(flags) \ @@ -219,6 +240,17 @@ DEFAULT=0, ON = 1, OFF = 2 #define DICT_TF_HAS_DATA_DIR(flags) \ ((flags & DICT_TF_MASK_DATA_DIR) \ >> DICT_TF_POS_DATA_DIR) + +/** Return the contents of the PAGE_ENCRYPTION field */ +#define DICT_TF_GET_PAGE_ENCRYPTION(flags) \ + ((flags & DICT_TF_MASK_PAGE_ENCRYPTION) \ + >> DICT_TF_POS_PAGE_ENCRYPTION) +/** Return the contents of the PAGE_ENCRYPTION KEY field */ +#define DICT_TF_GET_PAGE_ENCRYPTION_KEY(flags) \ + ((flags & DICT_TF_MASK_PAGE_ENCRYPTION_KEY) \ + >> DICT_TF_POS_PAGE_ENCRYPTION_KEY) + + /** Return the contents of the UNUSED bits */ #define DICT_TF_GET_UNUSED(flags) \ (flags >> DICT_TF_POS_UNUSED) diff --git a/storage/xtradb/include/dict0pagecompress.ic b/storage/xtradb/include/dict0pagecompress.ic index 811976434a83b..3ada655d601a8 100644 --- 
a/storage/xtradb/include/dict0pagecompress.ic +++ b/storage/xtradb/include/dict0pagecompress.ic @@ -42,6 +42,8 @@ dict_tf_verify_flags( ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(table_flags); ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(table_flags); ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(table_flags); + ulint page_encryption = DICT_TF_GET_PAGE_ENCRYPTION(table_flags); + ulint page_encryption_key = DICT_TF_GET_PAGE_ENCRYPTION_KEY(table_flags); ulint post_antelope = FSP_FLAGS_GET_POST_ANTELOPE(fsp_flags); ulint zip_ssize = FSP_FLAGS_GET_ZIP_SSIZE(fsp_flags); ulint fsp_atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(fsp_flags); @@ -50,6 +52,9 @@ dict_tf_verify_flags( ulint fsp_page_compression = FSP_FLAGS_GET_PAGE_COMPRESSION(fsp_flags); ulint fsp_page_compression_level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(fsp_flags); ulint fsp_atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(fsp_flags); + ulint fsp_page_encryption = FSP_FLAGS_GET_PAGE_ENCRYPTION(fsp_flags); + ulint fsp_page_encryption_key = FSP_FLAGS_GET_PAGE_ENCRYPTION_KEY(fsp_flags); + DBUG_EXECUTE_IF("dict_tf_verify_flags_failure", return(ULINT_UNDEFINED);); @@ -107,6 +112,26 @@ dict_tf_verify_flags( return (FALSE); } + if (page_encryption != fsp_page_encryption) { + fprintf(stderr, + "InnoDB: Error: table flags has page_encryption %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file has page_encryption %ld\n", + page_encryption, fsp_page_encryption); + + return (FALSE); + } + if (page_encryption_key != fsp_page_encryption_key) { + fprintf(stderr, + "InnoDB: Error: table flags has page_encryption_key %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file has page_encryption_key %ld\n", + page_encryption_key, fsp_page_encryption_key); + + return (FALSE); + } + + return(TRUE); } diff --git a/storage/xtradb/include/fil0fil.h b/storage/xtradb/include/fil0fil.h index 3960eef5d7e09..688e25957f8ff 100644 --- a/storage/xtradb/include/fil0fil.h +++ 
b/storage/xtradb/include/fil0fil.h @@ -157,6 +157,14 @@ static const ulint FIL_PAGE_COMPRESS_SIZE_V1 = FIL_PAGE_ORIGINAL_SIZE_V1 + 2; #define FIL_PAGE_COMPRESSION_ZLIB 1 /*!< Compressin algorithm ZLIB. */ #define FIL_PAGE_COMPRESSION_LZ4 2 /*!< Compressin algorithm LZ4. */ +#define FIL_PAGE_ENCRYPTION_AES_128 16 /*!< Encryption algorithm AES-128. */ +#define FIL_PAGE_ENCRYPTION_AES_196 24 /*!< Encryption algorithm AES-196. */ +#define FIL_PAGE_ENCRYPTION_AES_256 32 /*!< Encryption algorithm AES-256. */ + +#define FIL_PAGE_ENCRYPTED_SIZE 2 /*!< Number of bytes used to store + actual payload data size on encrypted pages. */ + + /* @} */ /** File page trailer @{ */ #define FIL_PAGE_END_LSN_OLD_CHKSUM 8 /*!< the low 4 bytes of this are used @@ -168,6 +176,7 @@ static const ulint FIL_PAGE_COMPRESS_SIZE_V1 = FIL_PAGE_ORIGINAL_SIZE_V1 + 2; /** File page types (values of FIL_PAGE_TYPE) @{ */ #define FIL_PAGE_PAGE_COMPRESSED 34354 /*!< Page compressed page */ +#define FIL_PAGE_PAGE_ENCRYPTED 34355 /*!< Page encrypted page */ #define FIL_PAGE_INDEX 17855 /*!< B-tree node */ #define FIL_PAGE_UNDO_LOG 2 /*!< Undo log page */ #define FIL_PAGE_INODE 3 /*!< Index node */ diff --git a/storage/xtradb/include/fil0pageencryption.h b/storage/xtradb/include/fil0pageencryption.h new file mode 100644 index 0000000000000..4d50d5088f393 --- /dev/null +++ b/storage/xtradb/include/fil0pageencryption.h @@ -0,0 +1,102 @@ +/***************************************************************************** + +Copyright (C) 2014 eperi GmbH. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +#ifndef fil0pageencryption_h +#define fil0pageencryption_h + +#ifndef EP_UNIT_TEST +#include "fsp0fsp.h" +#include "fsp0pageencryption.h" +#endif + +#define PAGE_ENCRYPTION_WRONG_KEY 1 +#define PAGE_ENCRYPTION_WRONG_PAGE_TYPE 2 +#define PAGE_ENCRYPTION_ERROR 3 +#define PAGE_ENCRYPTION_OK 0 + + + +/******************************************************************//** +@file include/fil0pageencryption.h +Helper functions for encryption/decryption page data on to table space. + +Created 08/25/2014 Florin Fugaciu +***********************************************************************/ + + +/*******************************************************************//** +Returns the page encryption flag of the space, or false if the space +is not encrypted. The tablespace must be cached in the memory cache. +@return true if page encrypted, false if not or space not found */ +ibool +fil_space_is_page_encrypted( +/*=========================*/ + ulint id); /*!< in: space id */ + + +/*******************************************************************//** +Find out whether the page is page encrypted +@return true if page is page encrypted, false if not */ +UNIV_INLINE +ibool +fil_page_is_encrypted( +/*===================*/ + const byte *buf); /*!< in: page */ + + +/****************************************************************//** +For page encrypted pages encrypt the page before actual write +operation. +@return encrypted page to be written*/ +byte* +fil_encrypt_page( +/*==============*/ + ulint space_id, /*!< in: tablespace id of the + table. 
*/ + byte* buf, /*!< in: buffer from which to write; in aio + this must be appropriately aligned */ + byte* out_buf, /*!< out: compressed buffer */ + ulint len, /*!< in: length of input buffer.*/ + ulint compression_level, /*!< in: compression level */ + ulint* out_len, /*!< out: actual length of compressed page */ + ulint mode /*!< in: calling mode. Should be 0. */ + ); + +/****************************************************************//** +For page encrypted pages decrypt the page after actual read +operation. +@return decrypted page */ +ulint +fil_decrypt_page( +/*================*/ + byte* page_buf, /*!< in: preallocated buffer or NULL */ + byte* buf, /*!< out: buffer from which to read; in aio + this must be appropriately aligned */ + ulint len, /*!< in: length of output buffer.*/ + ulint* write_size, /*!< in/out: Actual payload size of the decrypted data. */ + ibool* page_compressed, /*!> FSP_FLAGS_POS_ATOMIC_WRITES) + +#define FSP_FLAGS_GET_PAGE_ENCRYPTION(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_ENCRYPTION) \ + >> FSP_FLAGS_POS_PAGE_ENCRYPTION) +/** Return the value of the PAGE_ENCRYPTION_KEY field */ +#define FSP_FLAGS_GET_PAGE_ENCRYPTION_KEY(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_ENCRYPTION_KEY) \ + >> FSP_FLAGS_POS_PAGE_ENCRYPTION_KEY) + + /** Set a PAGE_SSIZE into the correct bits in a given tablespace flags. */ #define FSP_FLAGS_SET_PAGE_SSIZE(flags, ssize) \ @@ -186,6 +223,14 @@ tablespace flags. */ tablespace flags. */ #define FSP_FLAGS_SET_PAGE_COMPRESSION_LEVEL(flags, level) \ (flags | (level << FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL)) + +/** Set a PAGE_ENCRYPTION into the correct bits in a given tablespace flags. */ +#define FSP_FLAGS_SET_PAGE_ENCRYPTION(flags, encryption) \ + (flags | (encryption << FSP_FLAGS_POS_PAGE_ENCRYPTION)) +/** Set a PAGE_ENCRYPTION_KEY into the correct bits in a given tablespace flags. 
*/ +#define FSP_FLAGS_SET_PAGE_ENCRYPTION_KEY(flags, encryption_key) \ + (flags | (encryption_key << FSP_FLAGS_POS_PAGE_ENCRYPTION_KEY)) + /** Set a ATOMIC_WRITES into the correct bits in a given tablespace flags. */ #define FSP_FLAGS_SET_ATOMIC_WRITES(flags, atomics) \ diff --git a/storage/xtradb/include/fsp0fsp.ic b/storage/xtradb/include/fsp0fsp.ic index ddcb87b0e570d..368b2d73404c3 100644 --- a/storage/xtradb/include/fsp0fsp.ic +++ b/storage/xtradb/include/fsp0fsp.ic @@ -66,6 +66,8 @@ fsp_flags_is_valid( ulint unused = FSP_FLAGS_GET_UNUSED(flags); ulint page_compression = FSP_FLAGS_GET_PAGE_COMPRESSION(flags); ulint page_compression_level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags); + ulint page_encryption = FSP_FLAGS_GET_PAGE_ENCRYPTION(flags); + ulint page_encryption_key = FSP_FLAGS_GET_PAGE_ENCRYPTION_KEY(flags); ulint atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(flags); DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", return(false);); diff --git a/storage/xtradb/include/fsp0pageencryption.h b/storage/xtradb/include/fsp0pageencryption.h new file mode 100644 index 0000000000000..1abc38cfb1d14 --- /dev/null +++ b/storage/xtradb/include/fsp0pageencryption.h @@ -0,0 +1,60 @@ +/***************************************************************************** + +/* Copyright (C) 2014 eperi GmbH. All Rights Reserved. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + +/******************************************************************/ + +/******************************************************************//** +@file include/fsp0pageencryption.h +Helper functions for extracting/storing page encryption information to file space. + +Created 08/28/2014 Florin Fugaciu +***********************************************************************/ + +#ifndef FSP0PAGEENCRYPTION_H_ +#define FSP0PAGEENCRYPTION_H_ + + + +/********************************************************************//** +Determine if the tablespace is page encrypted from dict_table_t::flags. +@return TRUE if page encrypted, FALSE if not page encrypted */ +UNIV_INLINE +ibool +fsp_flags_is_page_encrypted( +/*=========================*/ + ulint flags); /*!< in: tablespace flags */ + + +/********************************************************************//** +Extract the page encryption key from tablespace flags. +A tablespace has only one physical page encryption key +whether that page is encrypted or not. +@return page encryption key of the file-per-table tablespace, +or zero if the table is not encrypted. */ +UNIV_INLINE +ulint +fsp_flags_get_page_encryption_key( +/*=================================*/ + ulint flags); /*!< in: tablespace flags */ + + +#ifndef UNIV_NONINL +#include "fsp0pageencryption.ic" +#endif + + +#endif /* FSP0PAGEENCRYPTION_H_ */ diff --git a/storage/xtradb/include/fsp0pageencryption.ic b/storage/xtradb/include/fsp0pageencryption.ic new file mode 100644 index 0000000000000..7e9b0d9eea455 --- /dev/null +++ b/storage/xtradb/include/fsp0pageencryption.ic @@ -0,0 +1,118 @@ +/***************************************************************************** + + Copyright (C) 2014 eperi GmbH. All Rights Reserved. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + +/******************************************************************//** +@file include/fsp0pageencryption.ic +Implementation for helper functions for encrypting/decrypting pages +and atomic writes information to file space. + +Created 08/28/2014 Florin Fugaciu +***********************************************************************/ + +#include "fsp0fsp.h" + + + + +/********************************************************************//** +Determine if the tablespace is page encrypted from dict_table_t::flags. +@return TRUE if page encrypted, FALSE if not page encrypted */ +UNIV_INLINE +ibool +fsp_flags_is_page_encrypted( +/*=========================*/ + ulint flags) /*!< in: tablespace flags */ +{ + return(FSP_FLAGS_GET_PAGE_ENCRYPTION(flags)); +} + +/********************************************************************//** +Extract the page encryption key from tablespace flags. +A tablespace has only one physical page encryption key +whether that page is encrypted or not. +@return page encryption key of the file-per-table tablespace, +or zero if the table is not encrypted. 
*/ +UNIV_INLINE +ulint +fsp_flags_get_page_encryption_key( +/*=================================*/ + ulint flags) /*!< in: tablespace flags */ +{ + return(FSP_FLAGS_GET_PAGE_ENCRYPTION_KEY(flags)); +} + + +/*******************************************************************//** +Returns the page encryption flag of the space, or false if the space +is not encrypted. The tablespace must be cached in the memory cache. +@return true if page encrypted, false if not or space not found */ +UNIV_INLINE +ibool +fil_space_is_page_encrypted( +/*=========================*/ + ulint id) /*!< in: space id */ +{ + ulint flags; + + flags = fil_space_get_flags(id); + + if (flags && flags != ULINT_UNDEFINED) { + + return(fsp_flags_is_page_encrypted(flags)); + } + + return(FALSE); +} + +/*******************************************************************//** +Returns the page encryption key of the space, or 0 if the space +is not encrypted. The tablespace must be cached in the memory cache. +@return page encryption key, ULINT_UNDEFINED if space not found */ +UNIV_INLINE +ulint +fil_space_get_page_encryption_key( +/*=================================*/ + ulint id) /*!< in: space id */ +{ + ulint flags; + + flags = fil_space_get_flags(id); + + if (flags && flags != ULINT_UNDEFINED) { + + return(fsp_flags_get_page_encryption_key(flags)); + } + + return(flags); +} + + + +/*******************************************************************//** +Find out whether the page is page encrypted +@return true if page is page encrypted, false if not */ +UNIV_INLINE +ibool +fil_page_is_encrypted( +/*===================*/ + const byte *buf) /*!< in: page */ +{ + /* The decision is based only on the 2-byte value read at + offset FIL_PAGE_TYPE of the page buffer. */ + + return(mach_read_from_2(buf+FIL_PAGE_TYPE) == FIL_PAGE_PAGE_ENCRYPTED); +} diff --git a/storage/xtradb/include/os0file.h b/storage/xtradb/include/os0file.h index 76e77799b4372..841e1c447bded 100644 --- a/storage/xtradb/include/os0file.h +++ 
b/storage/xtradb/include/os0file.h @@ -322,10 +322,10 @@ The wrapper functions have the prefix of "innodb_". */ # define os_aio(type, mode, name, file, buf, offset, \ n, message1, message2, space_id, \ - trx, page_compressed, page_compression_level, write_size) \ + trx, page_compressed, page_compression_level, write_size, page_encryption, page_encryption_key) \ pfs_os_aio_func(type, mode, name, file, buf, offset, \ n, message1, message2, space_id, trx, \ - page_compressed, page_compression_level, write_size, \ + page_compressed, page_compression_level, write_size, page_encryption, page_encryption_key, \ __FILE__, __LINE__) # define os_file_read(file, buf, offset, n, compressed) \ @@ -374,10 +374,10 @@ to original un-instrumented file I/O APIs */ # define os_aio(type, mode, name, file, buf, offset, n, message1, \ message2, space_id, trx, \ - page_compressed, page_compression_level, write_size) \ + page_compressed, page_compression_level, write_size, page_encryption, page_encryption_key) \ os_aio_func(type, mode, name, file, buf, offset, n, \ message1, message2, space_id, trx, \ - page_compressed, page_compression_level, write_size) + page_compressed, page_compression_level, page_encryption, page_encryption_key, write_size) # define os_file_read(file, buf, offset, n, compressed) \ os_file_read_func(file, buf, offset, n, NULL, compressed) @@ -805,6 +805,10 @@ pfs_os_aio_func( operation for this page and if initialized we do not trim again if actual page size does not decrease. 
*/ + ibool page_encryption, /*!< in: is page encryption used + on this file space */ + ulint page_encryption_key, /*!< page encryption + key to be used */ const char* src_file,/*!< in: file name where func invoked */ ulint src_line);/*!< in: line where the func invoked */ /*******************************************************************//** @@ -1187,6 +1191,10 @@ os_aio_func( on this file space */ ulint page_compression_level, /*!< page compression level to be used */ + ibool page_encryption, /*!< in: is page encryption used + on this file space */ + ulint page_encryption_key, /*!< page encryption key + to be used */ ulint* write_size);/*!< in/out: Actual write size initialized after fist successfull trim operation for this page and if diff --git a/storage/xtradb/include/os0file.ic b/storage/xtradb/include/os0file.ic index 61300387e1bf9..f2ff5c79351d5 100644 --- a/storage/xtradb/include/os0file.ic +++ b/storage/xtradb/include/os0file.ic @@ -229,6 +229,11 @@ pfs_os_aio_func( operation for this page and if initialized we do not trim again if actual page size does not decrease. 
*/ + ibool page_encryption, /*!< in: is page encryption used + on this file space */ + ulint page_encryption_key, /*!< page encryption + key to be used */ + const char* src_file,/*!< in: file name where func invoked */ ulint src_line)/*!< in: line where the func invoked */ { @@ -245,7 +250,8 @@ pfs_os_aio_func( result = os_aio_func(type, mode, name, file, buf, offset, n, message1, message2, space_id, trx, - page_compression, page_compression_level, write_size); + page_compression, page_compression_level, + page_encryption, page_encryption_key, write_size); register_pfs_file_io_end(locker, n); diff --git a/storage/xtradb/include/srv0mon.h b/storage/xtradb/include/srv0mon.h index 8e6975ed68fe4..8f8c7317cba23 100644 --- a/storage/xtradb/include/srv0mon.h +++ b/storage/xtradb/include/srv0mon.h @@ -318,6 +318,11 @@ enum monitor_id_t { MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED, MONITOR_OVLD_PAGES_PAGE_COMPRESSION_ERROR, + /* New monitor variables for page encryption */ + MONITOR_OVLD_PAGES_PAGE_ENCRYPTED, + MONITOR_OVLD_PAGES_PAGE_DECRYPTED, + MONITOR_OVLD_PAGES_PAGE_ENCRYPTION_ERROR, + /* Index related counters */ MONITOR_MODULE_INDEX, MONITOR_INDEX_SPLIT, diff --git a/storage/xtradb/include/srv0srv.h b/storage/xtradb/include/srv0srv.h index aef04d003d5e5..832c84bf19165 100644 --- a/storage/xtradb/include/srv0srv.h +++ b/storage/xtradb/include/srv0srv.h @@ -124,6 +124,13 @@ struct srv_stats_t { /* Number of page compression errors */ ulint_ctr_64_t pages_page_compression_error; + /* Number of pages encrypted with page encryption */ + ulint_ctr_64_t pages_page_encrypted; + /* Number of pages decrypted with page encryption */ + ulint_ctr_64_t pages_page_decrypted; + /* Number of page encryption errors */ + ulint_ctr_64_t pages_page_encryption_error; + /** Number of data read in total (in bytes) */ ulint_ctr_1_t data_read; @@ -1156,6 +1163,12 @@ struct export_var_t{ compression */ ib_int64_t innodb_pages_page_compression_error;/*!< Number of page compression errors */ + 
ib_int64_t innodb_pages_page_encrypted;/*!< Number of pages + encrypted by page encryption */ + ib_int64_t innodb_pages_page_decrypted;/*!< Number of pages + decrypted by page encryption */ + ib_int64_t innodb_pages_page_encryption_error;/*!< Number of page + encryption errors */ }; /** Thread slot in the thread table. */ diff --git a/storage/xtradb/os/os0file.cc b/storage/xtradb/os/os0file.cc index d67573d14aa09..0a03adaa31156 100644 --- a/storage/xtradb/os/os0file.cc +++ b/storage/xtradb/os/os0file.cc @@ -43,7 +43,10 @@ Created 10/21/1995 Heikki Tuuri #include "srv0srv.h" #include "srv0start.h" #include "fil0fil.h" +#include "fsp0fsp.h" #include "fil0pagecompress.h" +#include "fsp0pageencryption.h" +#include "fil0pageencryption.h" #include "buf0buf.h" #include "btr0types.h" #include "trx0trx.h" @@ -219,9 +222,18 @@ struct os_aio_slot_t{ freed after the write has been completed */ + byte* page_encryption_page; /*!< Memory allocated for + page encrypted page and + freed after the write + has been completed */ + + ibool page_compression; ulint page_compression_level; + ibool page_encryption; + ulint page_encryption_key; + ulint* write_size; /*!< Actual write size initialized after fist successfull trim operation for this page and if @@ -232,7 +244,13 @@ struct os_aio_slot_t{ page compressed pages, do not free this */ + byte* page_buf2; /*!< Actual page buffer for + page encrypted pages, do not + free this */ + + ibool page_compress_success; + ibool page_encryption_success; #ifdef LINUX_NATIVE_AIO struct iocb control; /* Linux control block for aio */ @@ -378,6 +396,13 @@ os_slot_alloc_lzo_mem( os_aio_slot_t* slot); /*!< in: slot structure */ #endif +/**********************************************************************//** +Allocate memory for temporal buffer used for page encryption. This +buffer is freed later. 
*/ +UNIV_INTERN +void +os_slot_alloc_page_buf2( +os_aio_slot_t* slot); /*!< in: slot structure */ /****************************************************************//** Does error handling when a file operation fails. @return TRUE if we should retry the operation */ @@ -488,19 +513,19 @@ os_get_os_version(void) /* Windows : Handling synchronous IO on files opened asynchronously. -If file is opened for asynchronous IO (FILE_FLAG_OVERLAPPED) and also bound to +If file is opened for asynchronous IO (FILE_FLAG_OVERLAPPED) and also bound to a completion port, then every IO on this file would normally be enqueued to the completion port. Sometimes however we would like to do a synchronous IO. This is possible if we initialitze have overlapped.hEvent with a valid event and set its lowest order bit to 1 (see MSDN ReadFile and WriteFile description for more info) -We'll create this special event once for each thread and store in thread local +We'll create this special event once for each thread and store in thread local storage. */ /***********************************************************************//** -Initialize tls index.for event handle used for synchronized IO on files that +Initialize tls index.for event handle used for synchronized IO on files that might be opened with FILE_FLAG_OVERLAPPED. 
*/ static void win_init_syncio_event() @@ -3122,7 +3147,11 @@ os_file_read_func( ret = os_file_pread(file, buf, n, offset, trx); if ((ulint) ret == n) { - + if (fil_page_is_encrypted((byte *)buf)) { + if (fil_decrypt_page(NULL, (byte *)buf, n, NULL, &compressed, 0)!=PAGE_ENCRYPTION_OK) {; + return FALSE; + } + } /* Note that InnoDB writes files that are not formated as file spaces and they do not have FIL_PAGE_TYPE field, thus we must use here information is the actual @@ -3131,6 +3160,7 @@ os_file_read_func( fil_decompress_page(NULL, (byte *)buf, n, NULL); } + return(TRUE); } @@ -3242,6 +3272,12 @@ os_file_read_no_error_handling_func( if ((ulint) ret == n) { + + if (fil_page_is_encrypted((byte *)buf)) { + if (fil_decrypt_page(NULL, (byte *)buf, n, NULL, &compressed, 0)!=PAGE_ENCRYPTION_OK) return (FALSE); + } + + /* Note that InnoDB writes files that are not formated as file spaces and they do not have FIL_PAGE_TYPE field, thus we must use here information is the actual @@ -3250,6 +3286,9 @@ os_file_read_no_error_handling_func( fil_decompress_page(NULL, (byte *)buf, n, NULL); } + + + return(TRUE); } #endif /* __WIN__ */ @@ -4262,6 +4301,15 @@ os_aio_array_free( } } + for (i = 0; i < array->n_slots; i++) { + os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i); + if (slot->page_encryption_page) { + ut_free(slot->page_encryption_page); + slot->page_encryption_page = NULL; + } + } + + ut_free(array->slots); ut_free(array); @@ -4611,6 +4659,10 @@ os_aio_array_reserve_slot( on this file space */ ulint page_compression_level, /*!< page compression level to be used */ + ibool page_encryption, /*!< in: is page encryption used + on this file space */ + ulint page_encryption_key, /*!< page encryption key + to be used */ ulint* write_size)/*!< in/out: Actual write size initialized after fist successfull trim operation for this page and if @@ -4708,9 +4760,13 @@ os_aio_array_reserve_slot( slot->space_id = space_id; slot->page_compress_success = FALSE; + 
slot->page_encryption_success = FALSE; + slot->write_size = write_size; slot->page_compression_level = page_compression_level; slot->page_compression = page_compression; + slot->page_encryption_key = page_encryption_key; + slot->page_encryption = page_encryption; /* If the space is page compressed and this is write operation then we compress the page */ @@ -4756,6 +4812,37 @@ os_aio_array_reserve_slot( /* Take array mutex back */ os_mutex_enter(array->mutex); + } //CMD + /* If the space is page encryption and this is write operation + then we encrypt the page */ + if (message1 && type == OS_FILE_WRITE && page_encryption ) { + ulint real_len = len; + byte* tmp = NULL; + + /* Release the array mutex while encrypting */ + os_mutex_exit(array->mutex); + + // We allocate memory for page encrypted buffer if and only + // if it is not yet allocated. + if (slot->page_buf2 == NULL) { + os_slot_alloc_page_buf2(slot); + } + + ut_ad(slot->page_buf2); + tmp = fil_encrypt_page(fil_node_get_space_id(slot->message1), (byte *)buf, slot->page_buf2, len, page_encryption_key, &real_len, 0); + + /* If encryption succeeded, set up the length and buffer */ + if (tmp != buf) { + len = real_len; + buf = slot->page_buf2; + slot->len = real_len; + slot->page_encryption_success = TRUE; + } else { + slot->page_encryption_success = FALSE; + } + + /* Take array mutex back */ + os_mutex_enter(array->mutex); } #ifdef WIN_ASYNC_IO @@ -5038,6 +5125,10 @@ os_aio_func( on this file space */ ulint page_compression_level, /*!< page compression level to be used */ + ibool page_encryption, /*!< in: is page encryption used + on this file space */ + ulint page_encryption_key, /*!< page encryption key + to be used */ ulint* write_size)/*!< in/out: Actual write size initialized after fist successfull trim operation for this page and if @@ -5134,7 +5225,8 @@ os_aio_func( } slot = os_aio_array_reserve_slot(type, array, message1, message2, file, name, buf, offset, n, space_id, - page_compression, 
page_compression_level, write_size); + page_compression, page_compression_level, + page_encryption, page_encryption_key, write_size); if (type == OS_FILE_READ) { if (srv_use_native_aio) { os_n_file_reads++; @@ -5245,7 +5337,7 @@ os_aio_windows_handle( HANDLE port = READ_SEGMENT(segment)? read_completion_port : completion_port; for(;;) { - ret = GetQueuedCompletionStatus(port, &len, &key, + ret = GetQueuedCompletionStatus(port, &len, &key, (OVERLAPPED **)&slot, INFINITE); /* If shutdown key was received, repost the shutdown message and exit */ @@ -5260,19 +5352,19 @@ os_aio_windows_handle( if(WRITE_SEGMENT(segment)&& slot->type == OS_FILE_READ) { /* - Redirect read completions to the dedicated completion port + Redirect read completions to the dedicated completion port and thread. We need to split read and write threads. If we do not - do that, and just allow all io threads process all IO, it is possible + do that, and just allow all io threads process all IO, it is possible to get stuck in a deadlock in buffer pool code, - Currently, the problem is solved this way - "write io" threads + Currently, the problem is solved this way - "write io" threads always get all completion notifications, from both async reads and writes. Write completion is handled in the same thread that gets it. Read completion is forwarded via PostQueueCompletionStatus()) to the second completion port dedicated solely to reads. One of the "read io" threads waiting on this port will finally handle the IO. - Forwarding IO completion this way costs a context switch , and this + Forwarding IO completion this way costs a context switch , and this seems tolerable since asynchronous reads are by far less frequent. */ ut_a(PostQueuedCompletionStatus(read_completion_port, len, key, @@ -5469,6 +5561,33 @@ os_aio_linux_collect( /* We have not overstepped to next segment. 
*/ ut_a(slot->pos < end_pos); + + + /* page encryption */ + if (slot->message1 && slot->page_encryption) { + if (slot->page_buf2==NULL) { + os_slot_alloc_page_buf2(slot); + } + + ut_ad(slot->page_buf2); + + if (slot->type == OS_FILE_READ) { + if (fil_page_is_encrypted(slot->buf)) { + fil_decrypt_page(slot->page_buf2, slot->buf, slot->len, slot->write_size, NULL, 0); + } + } else { + if (slot->page_encryption_success && + fil_page_is_encrypted(slot->page_buf2)) { + ut_ad(slot->page_encryption_page); + if (srv_use_trim && os_fallocate_failed == FALSE) { + // Deallocate unused blocks from file system ??? + //os_file_trim(slot->file, slot, slot->len); + } + } + } + } + + /* If the table is page compressed and this is read, we decompress before we annouce the read is complete. For writes, we free the compressed page. */ @@ -5498,6 +5617,7 @@ os_aio_linux_collect( } } + /* Mark this request as completed. The error handling will be done in the calling function. */ os_mutex_enter(array->mutex); @@ -6515,6 +6635,24 @@ os_file_trim( } +/**********************************************************************//** +Allocate memory for temporal buffer used for page encryption. This +buffer is freed later. */ +UNIV_INTERN +void +os_slot_alloc_page_buf2( +/*===================*/ + os_aio_slot_t* slot) /*!< in: slot structure */ +{ + byte* cbuf2; + byte* cbuf; + + cbuf2 = static_cast(ut_malloc(UNIV_PAGE_SIZE*2)); + cbuf = static_cast(ut_align(cbuf2, UNIV_PAGE_SIZE)); + slot->page_encryption_page = static_cast(cbuf2); + slot->page_buf2 = static_cast(cbuf); +} + /**********************************************************************//** Allocate memory for temporal buffer used for page compression. This buffer is freed later. 
*/ @@ -6551,3 +6689,4 @@ os_slot_alloc_lzo_mem( ut_a(slot->lzo_mem != NULL); } #endif + diff --git a/storage/xtradb/os/os0file.cc.orig b/storage/xtradb/os/os0file.cc.orig new file mode 100644 index 0000000000000..c954c16a6fb10 --- /dev/null +++ b/storage/xtradb/os/os0file.cc.orig @@ -0,0 +1,6683 @@ +/*********************************************************************** + +Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. + +Portions of this file contain modifications contributed and copyrighted +by Percona Inc.. Those modifications are +gratefully acknowledged and are described briefly in the InnoDB +documentation. The contributions by Percona Inc. are incorporated with +their permission, and subject to the conditions contained in the file +COPYING.Percona. + +This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +***********************************************************************/ + +/**************************************************//** +@file os/os0file.cc +The interface to the operating system file i/o primitives + +Created 10/21/1995 Heikki Tuuri +*******************************************************/ + +#include "os0file.h" + +#ifdef UNIV_NONINL +#include "os0file.ic" +#endif +#include "ha_prototypes.h" +#include "ut0mem.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "fil0fil.h" +#include "fsp0fsp.h" +#include "fil0pagecompress.h" +#include "fsp0pageencryption.h" +#include "fil0pageencryption.h" +#include "buf0buf.h" +#include "btr0types.h" +#include "trx0trx.h" +#include "srv0mon.h" +#include "srv0srv.h" +#ifdef HAVE_POSIX_FALLOCATE +#include "fcntl.h" +#endif +#ifndef UNIV_HOTBACKUP +# include "os0sync.h" +# include "os0thread.h" +#else /* !UNIV_HOTBACKUP */ +# ifdef __WIN__ +/* Add includes for the _stat() call to compile on Windows */ +# include +# include +# include +# endif /* __WIN__ */ +#endif /* !UNIV_HOTBACKUP */ + +#if defined(LINUX_NATIVE_AIO) +#include +#endif + +#ifdef _WIN32 +#define IOCP_SHUTDOWN_KEY (ULONG_PTR)-1 +#endif + +#if defined(UNIV_LINUX) && defined(HAVE_SYS_IOCTL_H) +# include +# ifndef DFS_IOCTL_ATOMIC_WRITE_SET +# define DFS_IOCTL_ATOMIC_WRITE_SET _IOW(0x95, 2, uint) +# endif +#endif + +#ifdef HAVE_LZO +#include "lzo/lzo1x.h" +#endif + +/** Insert buffer segment id */ +static const ulint IO_IBUF_SEGMENT = 0; + +/** Log segment id */ +static const ulint IO_LOG_SEGMENT = 1; + +/* This specifies the file permissions InnoDB uses when it creates files in +Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to +my_umask */ + +#ifndef __WIN__ +/** Umask for creating files */ +UNIV_INTERN ulint os_innodb_umask 
= S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP; +#else +/** Umask for creating files */ +UNIV_INTERN ulint os_innodb_umask = 0; +#endif /* __WIN__ */ + +#ifndef UNIV_HOTBACKUP +/* We use these mutexes to protect lseek + file i/o operation, if the +OS does not provide an atomic pread or pwrite, or similar */ +#define OS_FILE_N_SEEK_MUTEXES 16 +UNIV_INTERN os_ib_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES]; + +/* In simulated aio, merge at most this many consecutive i/os */ +#define OS_AIO_MERGE_N_CONSECUTIVE 64 + +#ifdef WITH_INNODB_DISALLOW_WRITES +#define WAIT_ALLOW_WRITES() os_event_wait(srv_allow_writes_event) +#else +#define WAIT_ALLOW_WRITES() do { } while (0) +#endif /* WITH_INNODB_DISALLOW_WRITES */ + +/********************************************************************** + +InnoDB AIO Implementation: +========================= + +We support native AIO for windows and linux. For rest of the platforms +we simulate AIO by special io-threads servicing the IO-requests. + +Simulated AIO: +============== + +In platforms where we 'simulate' AIO following is a rough explanation +of the high level design. +There are four io-threads (for ibuf, log, read, write). +All synchronous IO requests are serviced by the calling thread using +os_file_write/os_file_read. The Asynchronous requests are queued up +in an array (there are four such arrays) by the calling thread. +Later these requests are picked up by the io-thread and are serviced +synchronously. + +Windows native AIO: +================== + +If srv_use_native_aio is not set then windows follow the same +code as simulated AIO. If the flag is set then native AIO interface +is used. On windows, one of the limitation is that if a file is opened +for AIO no synchronous IO can be done on it. Therefore we have an +extra fifth array to queue up synchronous IO requests. +There are innodb_file_io_threads helper threads. These threads work +on the four arrays mentioned above in Simulated AIO. 
No thread is +required for the sync array. +If a synchronous IO request is made, it is first queued in the sync +array. Then the calling thread itself waits on the request, thus +making the call synchronous. +If an AIO request is made the calling thread not only queues it in the +array but also submits the requests. The helper thread then collects +the completed IO request and calls completion routine on it. + +Linux native AIO: +================= + +If we have libaio installed on the system and innodb_use_native_aio +is set to TRUE we follow the code path of native AIO, otherwise we +do simulated AIO. +There are innodb_file_io_threads helper threads. These threads work +on the four arrays mentioned above in Simulated AIO. +If a synchronous IO request is made, it is handled by calling +os_file_write/os_file_read. +If an AIO request is made the calling thread not only queues it in the +array but also submits the requests. The helper thread then collects +the completed IO request and calls completion routine on it. 
+
+**********************************************************************/
+
+/** Flag: enable debug printout for asynchronous i/o */
+UNIV_INTERN ibool	os_aio_print_debug = FALSE;
+
+#ifdef UNIV_PFS_IO
+/* Keys to register InnoDB I/O with performance schema */
+UNIV_INTERN mysql_pfs_key_t innodb_file_data_key;
+UNIV_INTERN mysql_pfs_key_t innodb_file_log_key;
+UNIV_INTERN mysql_pfs_key_t innodb_file_temp_key;
+UNIV_INTERN mysql_pfs_key_t innodb_file_bmp_key;
+#endif /* UNIV_PFS_IO */
+
+/** The asynchronous i/o array slot structure. One slot describes one
+pending read or write request, including the optional page compression
+and page encryption state attached to it. */
+struct os_aio_slot_t{
+#ifdef WIN_ASYNC_IO
+	OVERLAPPED	control;	/*!< Windows control block for the
+					aio request, MUST be the first
+					element in the structure */
+	void		*arr;		/*!< Array this slot belongs to */
+#endif
+
+	ibool		is_read;	/*!< TRUE if a read operation */
+	ulint		pos;		/*!< index of the slot in the aio
+					array */
+	ibool		reserved;	/*!< TRUE if this slot is reserved */
+	time_t		reservation_time;/*!< time when reserved */
+	ulint		len;		/*!< length of the block to read or
+					write */
+	byte*		buf;		/*!< buffer used in i/o */
+	ulint		type;		/*!< OS_FILE_READ or OS_FILE_WRITE */
+	os_offset_t	offset;		/*!< file offset in bytes */
+	os_file_t	file;		/*!< file where to read or write */
+	const char*	name;		/*!< file name or path */
+	ibool		io_already_done;/*!< used only in simulated aio:
+					TRUE if the physical i/o already
+					made and only the slot message
+					needs to be passed to the caller
+					of os_aio_simulated_handle */
+	ulint		space_id;	/*!< NOTE(review): presumably the
+					tablespace id handed in by the
+					requester; confirm against
+					os_aio_array_reserve_slot() */
+	fil_node_t*	message1;	/*!< first message given by the
+					requester of an aio operation */
+	void*		message2;	/*!< second message given by the
+					requester of an aio operation;
+					the messages can be used to identify
+					which pending aio operation was
+					completed */
+	ulint		bitmap;		/*!< NOTE(review): purpose is not
+					visible in this part of the file */
+
+	byte*		page_compression_page; /*!< Memory allocated for
+					page compressed page and
+					freed after the write
+					has been completed */
+
+	byte*		page_encryption_page; /*!< Memory allocated for
+					page encrypted page and
+					freed after the write
+					has been completed */
+
+
+	ibool		page_compression;	/*!< is page compression used
+						for this request */
+	ulint		page_compression_level;	/*!< page compression level
+						to be used */
+
+	ibool		page_encryption;	/*!< is page encryption used
+						for this request */
+	ulint		page_encryption_key;	/*!< page encryption key
+						to be used */
+
+	ulint*		write_size;	/*!< Actual write size initialized
+					after first successful trim
+					operation for this page and if
+					initialized we do not trim again if
+					actual page size does not decrease. */
+
+	byte*		page_buf;	/*!< Actual page buffer for
+					page compressed pages, do not
+					free this */
+
+	byte*		page_buf2;	/*!< Actual page buffer for
+					page encrypted pages, do not
+					free this */
+
+
+	ibool		page_compress_success;	/*!< result flag of the page
+						compression step */
+	ibool		page_encryption_success;/*!< result flag of the page
+						encryption step */
+
+#ifdef LINUX_NATIVE_AIO
+	struct iocb	control;	/* Linux control block for aio */
+	int		n_bytes;	/* bytes written/read. */
+	int		ret;		/* AIO return code */
+#endif /* LINUX_NATIVE_AIO */
+	byte		*lzo_mem;	/* Temporary memory used by LZO */
+};
+
+/** The asynchronous i/o array structure */
+struct os_aio_array_t{
+	os_ib_mutex_t	mutex;	/*!< the mutex protecting the aio array */
+	os_event_t	not_full;
+				/*!< The event which is set to the
+				signaled state when there is space in
+				the aio outside the ibuf segment */
+	os_event_t	is_empty;
+				/*!< The event which is set to the
+				signaled state when there are no
+				pending i/os in this array */
+	ulint		n_slots;/*!< Total number of slots in the aio
+				array. This must be divisible by
+				n_threads. */
+	ulint		n_segments;
+				/*!< Number of segments in the aio
+				array of pending aio requests. A
+				thread can wait separately for any one
+				of the segments. */
+	ulint		cur_seg;/*!< We reserve IO requests in round
+				robin fashion to different segments.
+				This points to the segment that is to
+				be used to service next IO request. */
+	ulint		n_reserved;
+				/*!< Number of reserved slots in the
+				aio array outside the ibuf segment */
+	os_aio_slot_t*	slots;	/*!< Pointer to the slots in the array */
+
+#if defined(LINUX_NATIVE_AIO)
+	io_context_t*		aio_ctx;
+				/* completion queue for IO. There is
+				one such queue per segment. Each thread
+				will work on one ctx exclusively. */
+	struct io_event*	aio_events;
+				/* The array to collect completed IOs.
+				There is one such event for each
+				possible pending IO. The size of the
+				array is equal to n_slots. */
+#endif /* LINUX_NATIVE_AIO */
+};
+
+#if defined(LINUX_NATIVE_AIO)
+/** timeout for each io_getevents() call = 500ms. */
+#define OS_AIO_REAP_TIMEOUT	(500000000UL)
+
+/** time to sleep, in microseconds if io_setup() returns EAGAIN. */
+#define OS_AIO_IO_SETUP_RETRY_SLEEP	(500000UL)
+
+/** number of attempts before giving up on io_setup(). */
+#define OS_AIO_IO_SETUP_RETRY_ATTEMPTS	5
+#endif
+
+/** Array of events used in simulated aio */
+static os_event_t*	os_aio_segment_wait_events = NULL;
+
+/** The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
+are NULL when the module has not yet been initialized. @{ */
+static os_aio_array_t*	os_aio_read_array = NULL; /*!< Reads */
+static os_aio_array_t*	os_aio_write_array = NULL; /*!< Writes */
+static os_aio_array_t*	os_aio_ibuf_array = NULL; /*!< Insert buffer */
+static os_aio_array_t*	os_aio_log_array = NULL; /*!< Redo log */
+static os_aio_array_t*	os_aio_sync_array = NULL; /*!< Synchronous I/O */
+/* @} */
+
+/** Number of asynchronous I/O segments. Set by os_aio_init().
*/ +static ulint os_aio_n_segments = ULINT_UNDEFINED; + +/** If the following is TRUE, read i/o handler threads try to +wait until a batch of new read requests have been posted */ +static ibool os_aio_recommend_sleep_for_read_threads = FALSE; +#endif /* !UNIV_HOTBACKUP */ + +UNIV_INTERN ulint os_n_file_reads = 0; +UNIV_INTERN ulint os_bytes_read_since_printout = 0; +UNIV_INTERN ulint os_n_file_writes = 0; +UNIV_INTERN ulint os_n_fsyncs = 0; +UNIV_INTERN ulint os_n_file_reads_old = 0; +UNIV_INTERN ulint os_n_file_writes_old = 0; +UNIV_INTERN ulint os_n_fsyncs_old = 0; +UNIV_INTERN time_t os_last_printout; + +UNIV_INTERN ibool os_has_said_disk_full = FALSE; + +#if !defined(UNIV_HOTBACKUP) \ + && (!defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8) +/** The mutex protecting the following counts of pending I/O operations */ +static os_ib_mutex_t os_file_count_mutex; +#endif /* !UNIV_HOTBACKUP && (!HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8) */ + +/** Number of pending os_file_pread() operations */ +UNIV_INTERN ulint os_file_n_pending_preads = 0; +/** Number of pending os_file_pwrite() operations */ +UNIV_INTERN ulint os_file_n_pending_pwrites = 0; +/** Number of pending write operations */ +UNIV_INTERN ulint os_n_pending_writes = 0; +/** Number of pending read operations */ +UNIV_INTERN ulint os_n_pending_reads = 0; + +/** After first fallocate failure we will disable os_file_trim */ +UNIV_INTERN ibool os_fallocate_failed = FALSE; + +/**********************************************************************//** +Directly manipulate the allocated disk space by deallocating for the file referred to +by fd for the byte range starting at offset and continuing for len bytes. +Within the specified range, partial file system blocks are zeroed, and whole +file system blocks are removed from the file. After a successful call, +subsequent reads from this range will return zeroes. 
+@return true if success, false if error */ +UNIV_INTERN +ibool +os_file_trim( +/*=========*/ + os_file_t file, /*!< in: file to be trimmed */ + os_aio_slot_t* slot, /*!< in: slot structure */ + ulint len); /*!< in: length of area */ + +/**********************************************************************//** +Allocate memory for temporal buffer used for page compression. This +buffer is freed later. */ +UNIV_INTERN +void +os_slot_alloc_page_buf( +/*===================*/ + os_aio_slot_t* slot); /*!< in: slot structure */ + +#ifdef HAVE_LZO +/**********************************************************************//** +Allocate memory for temporal memory used for page compression when +LZO compression method is used */ +UNIV_INTERN +void +os_slot_alloc_lzo_mem( +/*===================*/ + os_aio_slot_t* slot); /*!< in: slot structure */ +#endif + +/**********************************************************************//** +Allocate memory for temporal buffer used for page encryption. This +buffer is freed later. */ +UNIV_INTERN +void +os_slot_alloc_page_buf2( +os_aio_slot_t* slot); /*!< in: slot structure */ +/****************************************************************//** +Does error handling when a file operation fails. +@return TRUE if we should retry the operation */ +ibool +os_file_handle_error_no_exit( +/*=========================*/ + const char* name, /*!< in: name of a file or NULL */ + const char* operation, /*!< in: operation */ + ibool on_error_silent,/*!< in: if TRUE then don't print + any message to the log. */ + const char* file, /*!< in: file name */ + const ulint line); /*!< in: line */ + +/****************************************************************//** +Tries to enable the atomic write feature, if available, for the specified file +handle. 
+@return TRUE if success */
+static __attribute__((warn_unused_result))
+ibool
+os_file_set_atomic_writes(
+/*======================*/
+	const char*	name,	/*!< in: name of the file */
+	os_file_t	file);	/*!< in: handle to the file */
+
+#ifdef UNIV_DEBUG
+# ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Throttled wrapper around os_aio_validate(): the full consistency check is
+run only on every OS_AIO_VALIDATE_SKIP-th call, all other calls report
+success without checking anything.
+@return TRUE if ok or the check was skipped */
+UNIV_INTERN
+ibool
+os_aio_validate_skip(void)
+/*======================*/
+{
+/** Try os_aio_validate() every this many times */
+# define OS_AIO_VALIDATE_SKIP	13
+
+	/** Number of calls left before the next real validation.
+	Kept signed because the unsynchronized decrement below may
+	take it past zero. */
+	static int	calls_left = OS_AIO_VALIDATE_SKIP;
+
+	/* The counter is updated without any locking. The resulting
+	race is harmless: the counter only serves to call the costly
+	os_aio_validate() less often in debug builds. */
+	if (--calls_left <= 0) {
+		calls_left = OS_AIO_VALIDATE_SKIP;
+
+		return(os_aio_validate());
+	}
+
+	return(TRUE);
+}
+# endif /* !UNIV_HOTBACKUP */
+#endif /* UNIV_DEBUG */
+
+#ifdef _WIN32
+/** IO completion port used by background io threads */
+static HANDLE completion_port;
+/** IO completion port used by background io READ threads */
+static HANDLE read_completion_port;
+/** Thread local storage index for the per-thread event used for synchronous IO */
+static DWORD tls_sync_io = TLS_OUT_OF_INDEXES;
+#endif
+
+#ifdef __WIN__
+/***********************************************************************//**
+Gets the operating system version. Currently works only on Windows.
+@return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000, OS_WINXP, OS_WINVISTA,
+OS_WIN7.
*/
+UNIV_INTERN
+ulint
+os_get_os_version(void)
+/*===================*/
+{
+	OSVERSIONINFO	version;
+
+	version.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
+
+	/* The query itself must succeed, otherwise we assert. */
+	ut_a(GetVersionEx(&version));
+
+	/* First dispatch on the platform family; only the NT family
+	needs a closer look at the version numbers. */
+	switch (version.dwPlatformId) {
+	case VER_PLATFORM_WIN32s:
+		return(OS_WIN31);
+	case VER_PLATFORM_WIN32_WINDOWS:
+		return(OS_WIN95);
+	case VER_PLATFORM_WIN32_NT:
+		break;
+	default:
+		ut_error;
+		return(0);
+	}
+
+	/* Within one major version, minor version 0 is the older of
+	the two releases we distinguish. */
+	const bool	minor_is_zero = (version.dwMinorVersion == 0);
+
+	switch (version.dwMajorVersion) {
+	case 3:
+	case 4:
+		return(OS_WINNT);
+	case 5:
+		return(minor_is_zero ? OS_WIN2000 : OS_WINXP);
+	case 6:
+		return(minor_is_zero ? OS_WINVISTA : OS_WIN7);
+	default:
+		/* Any other major version is reported as OS_WIN7. */
+		return(OS_WIN7);
+	}
+}
+#endif /* __WIN__ */
+
+
+#ifdef _WIN32
+/*
+Windows : Handling synchronous IO on files opened asynchronously.
+
+If a file is opened for asynchronous IO (FILE_FLAG_OVERLAPPED) and also bound
+to a completion port, then every IO on this file would normally be enqueued to
+the completion port. Sometimes however we would like to do a synchronous IO.
+This is possible if we initialize overlapped.hEvent with a valid event and set
+its lowest order bit to 1 (see MSDN ReadFile and WriteFile description for
+more info).
+
+We'll create this special event once for each thread and store it in thread
+local storage.
+*/
+
+
+/***********************************************************************//**
+Initialize tls index.for event handle used for synchronized IO on files that
+might be opened with FILE_FLAG_OVERLAPPED.
+*/
+static void win_init_syncio_event()
+{
+	tls_sync_io = TlsAlloc();
+	ut_a(tls_sync_io != TLS_OUT_OF_INDEXES);
+}
+
+/***********************************************************************//**
+Retrieve the per-thread event for doing synchronous io on asynchronously
+opened files. The TLS index and the event are created on first use.
+@return event handle with its lowest order bit set */
+static HANDLE win_get_syncio_event()
+{
+	HANDLE h;
+	if(tls_sync_io == TLS_OUT_OF_INDEXES){
+		win_init_syncio_event();
+	}
+
+	h = (HANDLE)TlsGetValue(tls_sync_io);
+	if (h)
+		return h;
+	h = CreateEventA(NULL, FALSE, FALSE, NULL);
+	ut_a(h);
+	/* Tag the handle: the low bit is what makes the IO synchronous
+	(see the comment block above). The tagged value is what gets
+	cached in the TLS slot. */
+	h = (HANDLE)((uintptr_t)h | 1);
+	TlsSetValue(tls_sync_io, h);
+	return h;
+}
+
+/*
+  TLS destructor, inspired by Chromium code
+  http://src.chromium.org/svn/trunk/src/base/threading/thread_local_storage_win.cc
+*/
+
+/***********************************************************************//**
+Close the calling thread's synchronous io event, if it ever created one.
+The TLS slot is read directly instead of going through
+win_get_syncio_event(): that function creates the event (and even the TLS
+index) on demand, which would make every exiting thread allocate an event
+only to close it again. */
+static void win_free_syncio_event()
+{
+	if (tls_sync_io == TLS_OUT_OF_INDEXES) {
+		/* Synchronous io was never set up in this process. */
+		return;
+	}
+
+	HANDLE h = (HANDLE)TlsGetValue(tls_sync_io);
+	if (h) {
+		/* Strip the tag bit added in win_get_syncio_event() and
+		clear the slot so that the stale handle cannot be reused. */
+		CloseHandle((HANDLE)((uintptr_t)h & ~(uintptr_t)1));
+		TlsSetValue(tls_sync_io, NULL);
+	}
+}
+
+/***********************************************************************//**
+TLS callback: releases the per-thread event on thread or process detach.
+The module and reserved arguments are not used. */
+static void NTAPI win_tls_thread_exit(PVOID module, DWORD reason, PVOID reserved) {
+	if (DLL_THREAD_DETACH == reason || DLL_PROCESS_DETACH == reason)
+		win_free_syncio_event();
+}
+
+/* Register win_tls_thread_exit in the .CRT$XLB section and force the linker
+to keep the TLS directory and the callback pointer. */
+extern "C" {
+#ifdef _WIN64
+#pragma comment(linker, "/INCLUDE:_tls_used")
+#pragma comment(linker, "/INCLUDE:p_thread_callback_base")
+#pragma const_seg(".CRT$XLB")
+extern const PIMAGE_TLS_CALLBACK p_thread_callback_base;
+const PIMAGE_TLS_CALLBACK p_thread_callback_base = win_tls_thread_exit;
+/* Reset the const segment that was changed above (data_seg() would leave
+it pointing at .CRT$XLB for the rest of this file). */
+#pragma const_seg()
+#else
+#pragma comment(linker, "/INCLUDE:__tls_used")
+#pragma comment(linker, "/INCLUDE:_p_thread_callback_base")
+#pragma data_seg(".CRT$XLB")
+PIMAGE_TLS_CALLBACK p_thread_callback_base = win_tls_thread_exit;
+#pragma data_seg()
+#endif
+}
+#endif /*_WIN32 */
+
+/***********************************************************************//**
+Retrieves the last error number if an error occurs in a file io function.
+The number should be retrieved before any other OS calls (because they may
+overwrite the error number). If the number is not known to this program,
+the OS error number + 100 is returned.
+@return error number, or OS error number + 100 */
+static
+ulint
+os_file_get_last_error_low(
+/*=======================*/
+	bool	report_all_errors,	/*!< in: TRUE if we want an error
+					message printed of all errors */
+	bool	on_error_silent)	/*!< in: TRUE then don't print any
+					diagnostic to the log */
+{
+#ifdef __WIN__
+
+	/* Read the error code before anything else can overwrite it. */
+	ulint	err = (ulint) GetLastError();
+	if (err == ERROR_SUCCESS) {
+		return(0);
+	}
+
+	/* Disk full and "file exists" are reported by the callers, so they
+	are printed here only when all errors were asked for. */
+	if (report_all_errors
+	    || (!on_error_silent
+		&& err != ERROR_DISK_FULL
+		&& err != ERROR_FILE_EXISTS)) {
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Operating system error number %lu"
+			" in a file operation.\n", (ulong) err);
+
+		if (err == ERROR_PATH_NOT_FOUND) {
+			fprintf(stderr,
+				"InnoDB: The error means the system"
+				" cannot find the path specified.\n");
+
+			if (srv_is_being_started) {
+				fprintf(stderr,
+					"InnoDB: If you are installing InnoDB,"
+					" remember that you must create\n"
+					"InnoDB: directories yourself, InnoDB"
+					" does not create them.\n");
+			}
+		} else if (err == ERROR_ACCESS_DENIED) {
+			fprintf(stderr,
+				"InnoDB: The error means mysqld does not have"
+				" the access rights to\n"
+				"InnoDB: the directory. It may also be"
+				" you have created a subdirectory\n"
+				"InnoDB: of the same name as a data file.\n");
+		} else if (err == ERROR_SHARING_VIOLATION
+			   || err == ERROR_LOCK_VIOLATION) {
+			fprintf(stderr,
+				"InnoDB: The error means that another program"
+				" is using InnoDB's files.\n"
+				"InnoDB: This might be a backup or antivirus"
+				" software or another instance\n"
+				"InnoDB: of MySQL."
+				" Please close it to get rid of this error.\n");
+		} else if (err == ERROR_WORKING_SET_QUOTA
+			   || err == ERROR_NO_SYSTEM_RESOURCES) {
+			fprintf(stderr,
+				"InnoDB: The error means that there are no"
+				" sufficient system resources or quota to"
+				" complete the operation.\n");
+		} else if (err == ERROR_OPERATION_ABORTED) {
+			fprintf(stderr,
+				"InnoDB: The error means that the I/O"
+				" operation has been aborted\n"
+				"InnoDB: because of either a thread exit"
+				" or an application request.\n"
+				"InnoDB: Retry attempt is made.\n");
+		} else if (err == ECANCELED || err == ENOTTY) {
+			/* err is an ulint, while both the %d conversion
+			and strerror() expect an int: convert explicitly
+			and look the message up only once. */
+			const char*	msg = strerror((int) err);
+
+			if (msg != NULL) {
+				fprintf(stderr,
+					"InnoDB: Error number %d"
+					" means '%s'.\n",
+					(int) err, msg);
+			}
+
+			if(srv_use_atomic_writes) {
+				fprintf(stderr,
+					"InnoDB: Error trying to enable atomic writes on "
+					"non-supported destination!\n");
+			}
+		} else {
+			fprintf(stderr,
+				"InnoDB: Some operating system error numbers"
+				" are described at\n"
+				"InnoDB: "
+				REFMAN
+				"operating-system-error-codes.html\n");
+		}
+	}
+
+	fflush(stderr);
+
+	/* Translate the Windows error into an OS_FILE_* code; anything
+	unknown is returned offset by OS_FILE_ERROR_MAX. */
+	if (err == ERROR_FILE_NOT_FOUND) {
+		return(OS_FILE_NOT_FOUND);
+	} else if (err == ERROR_DISK_FULL) {
+		return(OS_FILE_DISK_FULL);
+	} else if (err == ERROR_FILE_EXISTS) {
+		return(OS_FILE_ALREADY_EXISTS);
+	} else if (err == ERROR_SHARING_VIOLATION
+		   || err == ERROR_LOCK_VIOLATION) {
+		return(OS_FILE_SHARING_VIOLATION);
+	} else if (err == ERROR_WORKING_SET_QUOTA
+		   || err == ERROR_NO_SYSTEM_RESOURCES) {
+		return(OS_FILE_INSUFFICIENT_RESOURCE);
+	} else if (err == ERROR_OPERATION_ABORTED) {
+		return(OS_FILE_OPERATION_ABORTED);
+	} else if (err == ERROR_ACCESS_DENIED) {
+		return(OS_FILE_ACCESS_VIOLATION);
+	} else {
+		return(OS_FILE_ERROR_MAX + err);
+	}
+#else
+	/* Read errno before anything else can overwrite it. */
+	int err = errno;
+	if (err == 0) {
+		return(0);
+	}
+
+	/* Disk full and "file exists" are reported by the callers, so they
+	are printed here only when all errors were asked for. */
+	if (report_all_errors
+	    || (err != ENOSPC && err != EEXIST && !on_error_silent)) {
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Operating system error number %d"
+			" in a file operation.\n", err);
+
+		if (err == ENOENT) {
+			fprintf(stderr,
+				"InnoDB: The error means the system"
+				" cannot find the path specified.\n");
+
+			if (srv_is_being_started) {
+				fprintf(stderr,
+					"InnoDB: If you are installing InnoDB,"
+					" remember that you must create\n"
+					"InnoDB: directories yourself, InnoDB"
+					" does not create them.\n");
+			}
+		} else if (err == EACCES) {
+			fprintf(stderr,
+				"InnoDB: The error means mysqld does not have"
+				" the access rights to\n"
+				"InnoDB: the directory.\n");
+		} else if (err == ECANCELED || err == ENOTTY) {
+			if (strerror(err) != NULL) {
+				fprintf(stderr,
+					"InnoDB: Error number %d"
+					" means '%s'.\n",
+					err, strerror(err));
+			}
+
+
+			if(srv_use_atomic_writes) {
+				fprintf(stderr,
+					"InnoDB: Error trying to enable atomic writes on "
+					"non-supported destination!\n");
+			}
+		} else {
+			if (strerror(err) != NULL) {
+				fprintf(stderr,
+					"InnoDB: Error number %d"
+					" means '%s'.\n",
+					err, strerror(err));
+			}
+
+
+			fprintf(stderr,
+				"InnoDB: Some operating system"
+				" error numbers are described at\n"
+				"InnoDB: "
+				REFMAN
+				"operating-system-error-codes.html\n");
+		}
+	}
+
+	fflush(stderr);
+
+	/* Translate errno into an OS_FILE_* code. EAGAIN and EINTR have a
+	dedicated code only when native aio is in use; every case that
+	breaks out of the switch is returned offset by OS_FILE_ERROR_MAX. */
+	switch (err) {
+	case ENOSPC:
+		return(OS_FILE_DISK_FULL);
+	case ENOENT:
+		return(OS_FILE_NOT_FOUND);
+	case EEXIST:
+		return(OS_FILE_ALREADY_EXISTS);
+	case EXDEV:
+	case ENOTDIR:
+	case EISDIR:
+		return(OS_FILE_PATH_ERROR);
+	case EAGAIN:
+		if (srv_use_native_aio) {
+			return(OS_FILE_AIO_RESOURCES_RESERVED);
+		}
+		break;
+	case ECANCELED:
+	case ENOTTY:
+		return(OS_FILE_OPERATION_NOT_SUPPORTED);
+	case EINTR:
+		if (srv_use_native_aio) {
+			return(OS_FILE_AIO_INTERRUPTED);
+		}
+		break;
+	case EACCES:
+		return(OS_FILE_ACCESS_VIOLATION);
+	}
+	return(OS_FILE_ERROR_MAX + err);
+#endif
+}
+
+/***********************************************************************//**
+Retrieves the last error number if an error occurs in a file io function.
+The number should be retrieved before any other OS calls (because they may
+overwrite the error number).
If the number is not known to this program,
+the OS error number + 100 is returned.
+@return error number, or OS error number + 100 */
+UNIV_INTERN
+ulint
+os_file_get_last_error(
+/*===================*/
+	bool	report_all_errors)	/*!< in: TRUE if we want an error
+					message printed of all errors */
+{
+	/* Same as the _low variant with diagnostics never silenced. */
+	return(os_file_get_last_error_low(report_all_errors, false));
+}
+
+/****************************************************************//**
+Does error handling when a file operation fails.
+Conditionally exits (calling exit(3)) based on should_exit value and the
+error type, if should_exit is TRUE then on_error_silent is ignored.
+@return TRUE if we should retry the operation */
+ibool
+os_file_handle_error_cond_exit(
+/*===========================*/
+	const char*	name,		/*!< in: name of a file or NULL */
+	const char*	operation,	/*!< in: operation */
+	ibool		should_exit,	/*!< in: call exit(3) if unknown error
+					and this parameter is TRUE */
+	ibool		on_error_silent,/*!< in: if TRUE then don't print
+					any message to the log iff it is
+					an unknown non-fatal error */
+	const char*	file,		/*!< in: file name printed in the
+					diagnostics (presumably the source
+					file of the call site; confirm
+					against callers) */
+	const ulint	line)		/*!< in: line printed in the
+					diagnostics */
+{
+	ulint	err;
+
+	err = os_file_get_last_error_low(false, on_error_silent);
+
+	switch (err) {
+	case OS_FILE_DISK_FULL:
+		/* We only print a warning about disk full once */
+
+		if (os_has_said_disk_full) {
+
+			return(FALSE);
+		}
+
+		/* Disk full error is reported irrespective of the
+		on_error_silent setting. */
+
+		if (name) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				" InnoDB: Encountered a problem with"
+				" file %s\n", name);
+		}
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Disk is full. Try to clean the disk"
+			" to free space.\n");
+
+		os_has_said_disk_full = TRUE;
+
+		/* line is an ulint: print it as unsigned long, the same
+		way the OS error number is printed elsewhere in this file */
+		fprintf(stderr,
+			" InnoDB: at file %s and at line %lu\n",
+			file, (ulong) line);
+
+		fflush(stderr);
+
+		return(FALSE);
+
+	case OS_FILE_AIO_RESOURCES_RESERVED:
+	case OS_FILE_AIO_INTERRUPTED:
+
+		/* Retry immediately */
+		return(TRUE);
+
+	case OS_FILE_PATH_ERROR:
+	case OS_FILE_ALREADY_EXISTS:
+	case OS_FILE_ACCESS_VIOLATION:
+
+		/* Retrying is not requested for these */
+		return(FALSE);
+
+	case OS_FILE_SHARING_VIOLATION:
+
+		os_thread_sleep(10000000);	/* 10 sec */
+		return(TRUE);
+
+	case OS_FILE_OPERATION_ABORTED:
+	case OS_FILE_INSUFFICIENT_RESOURCE:
+
+		os_thread_sleep(100000);	/* 100 ms */
+		return(TRUE);
+
+	default:
+
+		/* If it is an operation that can crash on error then it
+		is better to ignore on_error_silent and print an error message
+		to the log. */
+
+		if (should_exit || !on_error_silent) {
+			fprintf(stderr,
+				" InnoDB: Operation %s to file %s and at line %lu\n",
+				operation, file, (ulong) line);
+
+			ib_logf(IB_LOG_LEVEL_ERROR, "File %s: '%s' returned OS "
+				"error " ULINTPF ".%s", name ? name : "(unknown)",
+				operation, err, should_exit
+				? " Cannot continue operation" : "");
+		}
+
+		if (should_exit) {
+			exit(1);
+		}
+	}
+
+	return(FALSE);
+}
+
+/****************************************************************//**
+Does error handling when a file operation fails.
+@return TRUE if we should retry the operation */
+static
+ibool
+os_file_handle_error(
+/*=================*/
+	const char*	name,		/*!< in: name of a file or NULL */
+	const char*	operation,	/*!< in: operation */
+	const char*	file,		/*!< in: file name */
+	const ulint	line)		/*!< in: line */
+{
+	/* exit in case of unknown error */
+	return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE, file, line));
+}
+
+/****************************************************************//**
+Does error handling when a file operation fails.
+@return TRUE if we should retry the operation */ +ibool +os_file_handle_error_no_exit( +/*=========================*/ + const char* name, /*!< in: name of a file or NULL */ + const char* operation, /*!< in: operation */ + ibool on_error_silent,/*!< in: if TRUE then don't print + any message to the log. */ + const char* file, /*!< in: file name */ + const ulint line) /*!< in: line */ +{ + /* don't exit in case of unknown error */ + return(os_file_handle_error_cond_exit( + name, operation, FALSE, on_error_silent, file, line)); +} + +#undef USE_FILE_LOCK +#define USE_FILE_LOCK +#if defined(UNIV_HOTBACKUP) || defined(__WIN__) +/* InnoDB Hot Backup does not lock the data files. + * On Windows, mandatory locking is used. + */ +# undef USE_FILE_LOCK +#endif +#ifdef USE_FILE_LOCK +/****************************************************************//** +Obtain an exclusive lock on a file. +@return 0 on success */ +static +int +os_file_lock( +/*=========*/ + int fd, /*!< in: file descriptor */ + const char* name) /*!< in: file name */ +{ + struct flock lk; + + ut_ad(!srv_read_only_mode); + + lk.l_type = F_WRLCK; + lk.l_whence = SEEK_SET; + lk.l_start = lk.l_len = 0; + + if (fcntl(fd, F_SETLK, &lk) == -1) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Unable to lock %s, error: %d", name, errno); + + if (errno == EAGAIN || errno == EACCES) { + ib_logf(IB_LOG_LEVEL_INFO, + "Check that you do not already have " + "another mysqld process using the " + "same InnoDB data or log files."); + } + + return(-1); + } + + return(0); +} +#endif /* USE_FILE_LOCK */ + +#ifndef UNIV_HOTBACKUP +/****************************************************************//** +Creates the seek mutexes used in positioned reads and writes. 
*/ +UNIV_INTERN +void +os_io_init_simple(void) +/*===================*/ +{ +#if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8 + os_file_count_mutex = os_mutex_create(); +#endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD_SIZE < 8 */ + + for (ulint i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) { + os_file_seek_mutexes[i] = os_mutex_create(); + } +#ifdef _WIN32 + win_init_syncio_event(); +#endif +} + +/***********************************************************************//** +Creates a temporary file. This function is like tmpfile(3), but +the temporary file is created in the MySQL temporary directory. +@return temporary file handle, or NULL on error */ +UNIV_INTERN +FILE* +os_file_create_tmpfile(void) +/*========================*/ +{ + FILE* file = NULL; + int fd; + WAIT_ALLOW_WRITES(); + fd = innobase_mysql_tmpfile(); + + ut_ad(!srv_read_only_mode); + + if (fd >= 0) { + file = fdopen(fd, "w+b"); + } + + if (!file) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: unable to create temporary file;" + " errno: %d\n", errno); + if (fd >= 0) { + close(fd); + } + } + + return(file); +} +#endif /* !UNIV_HOTBACKUP */ + +/***********************************************************************//** +The os_file_opendir() function opens a directory stream corresponding to the +directory named by the dirname argument. The directory stream is positioned +at the first entry. In both Unix and Windows we automatically skip the '.' +and '..' items at the start of the directory listing. 
+@return directory stream, NULL if error */ +UNIV_INTERN +os_file_dir_t +os_file_opendir( +/*============*/ + const char* dirname, /*!< in: directory name; it must not + contain a trailing '\' or '/' */ + ibool error_is_fatal) /*!< in: TRUE if we should treat an + error as a fatal error; if we try to + open symlinks then we do not wish a + fatal error if it happens not to be + a directory */ +{ + os_file_dir_t dir; +#ifdef __WIN__ + LPWIN32_FIND_DATA lpFindFileData; + char path[OS_FILE_MAX_PATH + 3]; + + ut_a(strlen(dirname) < OS_FILE_MAX_PATH); + + strcpy(path, dirname); + strcpy(path + strlen(path), "\\*"); + + /* Note that in Windows opening the 'directory stream' also retrieves + the first entry in the directory. Since it is '.', that is no problem, + as we will skip over the '.' and '..' entries anyway. */ + + lpFindFileData = static_cast( + ut_malloc(sizeof(WIN32_FIND_DATA))); + + dir = FindFirstFile((LPCTSTR) path, lpFindFileData); + + ut_free(lpFindFileData); + + if (dir == INVALID_HANDLE_VALUE) { + + if (error_is_fatal) { + os_file_handle_error(dirname, "opendir", __FILE__, __LINE__); + } + + return(NULL); + } + + return(dir); +#else + dir = opendir(dirname); + + if (dir == NULL && error_is_fatal) { + os_file_handle_error(dirname, "opendir", __FILE__, __LINE__); + } + + return(dir); +#endif /* __WIN__ */ +} + +/***********************************************************************//** +Closes a directory stream. 
+@return 0 if success, -1 if failure */ +UNIV_INTERN +int +os_file_closedir( +/*=============*/ + os_file_dir_t dir) /*!< in: directory stream */ +{ +#ifdef __WIN__ + BOOL ret; + + ret = FindClose(dir); + + if (!ret) { + os_file_handle_error_no_exit(NULL, "closedir", FALSE, __FILE__, __LINE__); + + return(-1); + } + + return(0); +#else + int ret; + + ret = closedir(dir); + + if (ret) { + os_file_handle_error_no_exit(NULL, "closedir", FALSE, __FILE__, __LINE__); + } + + return(ret); +#endif /* __WIN__ */ +} + +/***********************************************************************//** +This function returns information of the next file in the directory. We jump +over the '.' and '..' entries in the directory. +@return 0 if ok, -1 if error, 1 if at the end of the directory */ +UNIV_INTERN +int +os_file_readdir_next_file( +/*======================*/ + const char* dirname,/*!< in: directory name or path */ + os_file_dir_t dir, /*!< in: directory stream */ + os_file_stat_t* info) /*!< in/out: buffer where the info is returned */ +{ +#ifdef __WIN__ + LPWIN32_FIND_DATA lpFindFileData; + BOOL ret; + + lpFindFileData = static_cast( + ut_malloc(sizeof(WIN32_FIND_DATA))); +next_file: + ret = FindNextFile(dir, lpFindFileData); + + if (ret) { + ut_a(strlen((char*) lpFindFileData->cFileName) + < OS_FILE_MAX_PATH); + + if (strcmp((char*) lpFindFileData->cFileName, ".") == 0 + || strcmp((char*) lpFindFileData->cFileName, "..") == 0) { + + goto next_file; + } + + strcpy(info->name, (char*) lpFindFileData->cFileName); + + info->size = (ib_int64_t)(lpFindFileData->nFileSizeLow) + + (((ib_int64_t)(lpFindFileData->nFileSizeHigh)) + << 32); + + if (lpFindFileData->dwFileAttributes + & FILE_ATTRIBUTE_REPARSE_POINT) { + /* TODO: test Windows symlinks */ + /* TODO: MySQL has apparently its own symlink + implementation in Windows, dbname.sym can + redirect a database directory: + REFMAN "windows-symbolic-links.html" */ + info->type = OS_FILE_TYPE_LINK; + } else if 
(lpFindFileData->dwFileAttributes + & FILE_ATTRIBUTE_DIRECTORY) { + info->type = OS_FILE_TYPE_DIR; + } else { + /* It is probably safest to assume that all other + file types are normal. Better to check them rather + than blindly skip them. */ + + info->type = OS_FILE_TYPE_FILE; + } + } + + ut_free(lpFindFileData); + + if (ret) { + return(0); + } else if (GetLastError() == ERROR_NO_MORE_FILES) { + + return(1); + } else { + os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE, __FILE__, __LINE__); + return(-1); + } +#else + struct dirent* ent; + char* full_path; + int ret; + struct stat statinfo; +#ifdef HAVE_READDIR_R + char dirent_buf[sizeof(struct dirent) + + _POSIX_PATH_MAX + 100]; + /* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as + the max file name len; but in most standards, the + length is NAME_MAX; we add 100 to be even safer */ +#endif + +next_file: + +#ifdef HAVE_READDIR_R + ret = readdir_r(dir, (struct dirent*) dirent_buf, &ent); + + if (ret != 0 +#ifdef UNIV_AIX + /* On AIX, only if we got non-NULL 'ent' (result) value and + a non-zero 'ret' (return) value, it indicates a failed + readdir_r() call. An NULL 'ent' with an non-zero 'ret' + would indicate the "end of the directory" is reached. 
*/ + && ent != NULL +#endif + ) { + fprintf(stderr, + "InnoDB: cannot read directory %s, error %lu\n", + dirname, (ulong) ret); + + return(-1); + } + + if (ent == NULL) { + /* End of directory */ + + return(1); + } + + ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1); +#else + ent = readdir(dir); + + if (ent == NULL) { + + return(1); + } +#endif + ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH); + + if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) { + + goto next_file; + } + + strcpy(info->name, ent->d_name); + + full_path = static_cast( + ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10)); + + sprintf(full_path, "%s/%s", dirname, ent->d_name); + + ret = stat(full_path, &statinfo); + + if (ret) { + + if (errno == ENOENT) { + /* readdir() returned a file that does not exist, + it must have been deleted in the meantime. Do what + would have happened if the file was deleted before + readdir() - ignore and go to the next entry. + If this is the last entry then info->name will still + contain the name of the deleted file when this + function returns, but this is not an issue since the + caller shouldn't be looking at info when end of + directory is returned. */ + + ut_free(full_path); + + goto next_file; + } + + os_file_handle_error_no_exit(full_path, "stat", FALSE, __FILE__, __LINE__); + + ut_free(full_path); + + return(-1); + } + + info->size = (ib_int64_t) statinfo.st_size; + + if (S_ISDIR(statinfo.st_mode)) { + info->type = OS_FILE_TYPE_DIR; + } else if (S_ISLNK(statinfo.st_mode)) { + info->type = OS_FILE_TYPE_LINK; + } else if (S_ISREG(statinfo.st_mode)) { + info->type = OS_FILE_TYPE_FILE; + } else { + info->type = OS_FILE_TYPE_UNKNOWN; + } + + ut_free(full_path); + + return(0); +#endif +} + +/*****************************************************************//** +This function attempts to create a directory named pathname. The new +directory gets default permissions. On Unix the permissions are +(0770 & ~umask). 
If the directory exists already, nothing is done and +the call succeeds, unless the fail_if_exists arguments is true. +If another error occurs, such as a permission error, this does not crash, +but reports the error and returns FALSE. +@return TRUE if call succeeds, FALSE on error */ +UNIV_INTERN +ibool +os_file_create_directory( +/*=====================*/ + const char* pathname, /*!< in: directory name as + null-terminated string */ + ibool fail_if_exists) /*!< in: if TRUE, pre-existing directory + is treated as an error. */ +{ +#ifdef __WIN__ + BOOL rcode; + + rcode = CreateDirectory((LPCTSTR) pathname, NULL); + if (!(rcode != 0 + || (GetLastError() == ERROR_ALREADY_EXISTS + && !fail_if_exists))) { + + os_file_handle_error_no_exit( + pathname, "CreateDirectory", FALSE, __FILE__, __LINE__); + + return(FALSE); + } + + return(TRUE); +#else + int rcode; + WAIT_ALLOW_WRITES(); + + rcode = mkdir(pathname, 0770); + + if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) { + /* failure */ + os_file_handle_error_no_exit(pathname, "mkdir", FALSE, __FILE__, __LINE__); + + return(FALSE); + } + + return (TRUE); +#endif /* __WIN__ */ +} + +/****************************************************************//** +NOTE! Use the corresponding macro os_file_create_simple(), not directly +this function! +A simple function to open or create a file. 
+@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INTERN +os_file_t +os_file_create_simple_func( +/*=======================*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode,/*!< in: create mode */ + ulint access_type,/*!< in: OS_FILE_READ_ONLY or + OS_FILE_READ_WRITE */ + ibool* success)/*!< out: TRUE if succeed, FALSE if error */ +{ + os_file_t file; + ibool retry; + + *success = FALSE; +#ifdef __WIN__ + DWORD access; + DWORD create_flag; + DWORD attributes = 0; + + ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); + ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); + + if (create_mode == OS_FILE_OPEN) { + + create_flag = OPEN_EXISTING; + + } else if (srv_read_only_mode) { + + create_flag = OPEN_EXISTING; + + } else if (create_mode == OS_FILE_CREATE) { + + create_flag = CREATE_NEW; + + } else if (create_mode == OS_FILE_CREATE_PATH) { + + ut_a(!srv_read_only_mode); + + /* Create subdirs along the path if needed */ + *success = os_file_create_subdirs_if_needed(name); + + if (!*success) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Unable to create subdirectories '%s'", + name); + + return((os_file_t) -1); + } + + create_flag = CREATE_NEW; + create_mode = OS_FILE_CREATE; + + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file create mode (%lu) for file '%s'", + create_mode, name); + + return((os_file_t) -1); + } + + if (access_type == OS_FILE_READ_ONLY) { + access = GENERIC_READ; + } else if (srv_read_only_mode) { + + ib_logf(IB_LOG_LEVEL_INFO, + "read only mode set. 
Unable to " + "open file '%s' in RW mode, trying RO mode", name); + + access = GENERIC_READ; + + } else if (access_type == OS_FILE_READ_WRITE) { + access = GENERIC_READ | GENERIC_WRITE; + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file access type (%lu) for file '%s'", + access_type, name); + + return((os_file_t) -1); + } + + do { + /* Use default security attributes and no template file. */ + + file = CreateFile( + (LPCTSTR) name, access, FILE_SHARE_READ, NULL, + create_flag, attributes, NULL); + + if (file == INVALID_HANDLE_VALUE) { + + *success = FALSE; + + retry = os_file_handle_error( + name, create_mode == OS_FILE_OPEN ? + "open" : "create", __FILE__, __LINE__); + + } else { + *success = TRUE; + retry = false; + } + + } while (retry); + +#else /* __WIN__ */ + int create_flag; + if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) + WAIT_ALLOW_WRITES(); + + ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); + ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); + + if (create_mode == OS_FILE_OPEN) { + + if (access_type == OS_FILE_READ_ONLY) { + create_flag = O_RDONLY; + } else if (srv_read_only_mode) { + create_flag = O_RDONLY; + } else { + create_flag = O_RDWR; + } + + } else if (srv_read_only_mode) { + + create_flag = O_RDONLY; + + } else if (create_mode == OS_FILE_CREATE) { + + create_flag = O_RDWR | O_CREAT | O_EXCL; + + } else if (create_mode == OS_FILE_CREATE_PATH) { + + /* Create subdirs along the path if needed */ + + *success = os_file_create_subdirs_if_needed(name); + + if (!*success) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Unable to create subdirectories '%s'", + name); + + return((os_file_t) -1); + } + + create_flag = O_RDWR | O_CREAT | O_EXCL; + create_mode = OS_FILE_CREATE; + } else { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file create mode (%lu) for file '%s'", + create_mode, name); + + return((os_file_t) -1); + } + + do { + file = ::open(name, create_flag, os_innodb_umask); + + if (file == -1) { + *success = FALSE; + + retry = 
os_file_handle_error( + name, + create_mode == OS_FILE_OPEN + ? "open" : "create", __FILE__, __LINE__); + } else { + *success = TRUE; + retry = false; + } + + } while (retry); + +#ifdef USE_FILE_LOCK + if (!srv_read_only_mode + && *success + && access_type == OS_FILE_READ_WRITE + && os_file_lock(file, name)) { + + *success = FALSE; + close(file); + file = -1; + } +#endif /* USE_FILE_LOCK */ + +#endif /* __WIN__ */ + + return(file); +} + +/****************************************************************//** +NOTE! Use the corresponding macro +os_file_create_simple_no_error_handling(), not directly this function! +A simple function to open or create a file. +@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INTERN +os_file_t +os_file_create_simple_no_error_handling_func( +/*=========================================*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode,/*!< in: create mode */ + ulint access_type,/*!< in: OS_FILE_READ_ONLY, + OS_FILE_READ_WRITE, or + OS_FILE_READ_ALLOW_DELETE; the last option is + used by a backup program reading the file */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ulint atomic_writes) /*! 
in: atomic writes table option + value */ +{ + os_file_t file; + atomic_writes_t awrites = (atomic_writes_t) atomic_writes; + + *success = FALSE; +#ifdef __WIN__ + DWORD access; + DWORD create_flag; + DWORD attributes = 0; + DWORD share_mode = FILE_SHARE_READ; + + ut_a(name); + + ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); + ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); + + if (create_mode == OS_FILE_OPEN) { + create_flag = OPEN_EXISTING; + } else if (srv_read_only_mode) { + create_flag = OPEN_EXISTING; + } else if (create_mode == OS_FILE_CREATE) { + create_flag = CREATE_NEW; + } else { + + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file create mode (%lu) for file '%s'", + create_mode, name); + + return((os_file_t) -1); + } + + if (access_type == OS_FILE_READ_ONLY) { + access = GENERIC_READ; + } else if (srv_read_only_mode) { + access = GENERIC_READ; + } else if (access_type == OS_FILE_READ_WRITE) { + access = GENERIC_READ | GENERIC_WRITE; + } else if (access_type == OS_FILE_READ_ALLOW_DELETE) { + + ut_a(!srv_read_only_mode); + + access = GENERIC_READ; + + /*!< A backup program has to give mysqld the maximum + freedom to do what it likes with the file */ + + share_mode |= FILE_SHARE_DELETE | FILE_SHARE_WRITE; + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file access type (%lu) for file '%s'", + access_type, name); + + return((os_file_t) -1); + } + + file = CreateFile((LPCTSTR) name, + access, + share_mode, + NULL, // Security attributes + create_flag, + attributes, + NULL); // No template file + + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. 
If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ + if (file != INVALID_HANDLE_VALUE + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + CloseHandle(file); + os_file_delete_if_exists_func(name); + *success = FALSE; + file = INVALID_HANDLE_VALUE; + } + } + + *success = (file != INVALID_HANDLE_VALUE); +#else /* __WIN__ */ + int create_flag; + + ut_a(name); + if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) + WAIT_ALLOW_WRITES(); + + ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); + ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); + + if (create_mode == OS_FILE_OPEN) { + + if (access_type == OS_FILE_READ_ONLY) { + + create_flag = O_RDONLY; + + } else if (srv_read_only_mode) { + + create_flag = O_RDONLY; + + } else { + + ut_a(access_type == OS_FILE_READ_WRITE + || access_type == OS_FILE_READ_ALLOW_DELETE); + + create_flag = O_RDWR; + } + + } else if (srv_read_only_mode) { + + create_flag = O_RDONLY; + + } else if (create_mode == OS_FILE_CREATE) { + + create_flag = O_RDWR | O_CREAT | O_EXCL; + + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file create mode (%lu) for file '%s'", + create_mode, name); + + return((os_file_t) -1); + } + + file = ::open(name, create_flag, os_innodb_umask); + + *success = file == -1 ? FALSE : TRUE; + +#ifdef USE_FILE_LOCK + if (!srv_read_only_mode + && *success + && access_type == OS_FILE_READ_WRITE + && os_file_lock(file, name)) { + + *success = FALSE; + close(file); + file = -1; + + } +#endif /* USE_FILE_LOCK */ + + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. 
If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ + if (file != -1 + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + close(file); + os_file_delete_if_exists_func(name); + *success = FALSE; + file = -1; + } + } + +#endif /* __WIN__ */ + + return(file); +} + +/****************************************************************//** +Tries to disable OS caching on an opened file descriptor. */ +UNIV_INTERN +void +os_file_set_nocache( +/*================*/ + int fd /*!< in: file descriptor to alter */ + __attribute__((unused)), + const char* file_name /*!< in: used in the diagnostic + message */ + __attribute__((unused)), + const char* operation_name __attribute__((unused))) + /*!< in: "open" or "create"; used + in the diagnostic message */ +{ + /* some versions of Solaris may not have DIRECTIO_ON */ +#if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) + if (directio(fd, DIRECTIO_ON) == -1) { + int errno_save = errno; + + ib_logf(IB_LOG_LEVEL_ERROR, + "Failed to set DIRECTIO_ON on file %s: %s: %s, " + "continuing anyway.", + file_name, operation_name, strerror(errno_save)); + } +#elif defined(O_DIRECT) + if (fcntl(fd, F_SETFL, O_DIRECT) == -1) { + int errno_save = errno; + static bool warning_message_printed = false; + if (errno_save == EINVAL) { + if (!warning_message_printed) { + warning_message_printed = true; +# ifdef UNIV_LINUX + ib_logf(IB_LOG_LEVEL_WARN, + "Failed to set O_DIRECT on file " + "%s: %s: %s, continuing anyway. 
" + "O_DIRECT is known to result " + "in 'Invalid argument' on Linux on " + "tmpfs, see MySQL Bug#26662.", + file_name, operation_name, + strerror(errno_save)); +# else /* UNIV_LINUX */ + goto short_warning; +# endif /* UNIV_LINUX */ + } + } else { +# ifndef UNIV_LINUX +short_warning: +# endif + ib_logf(IB_LOG_LEVEL_WARN, + "Failed to set O_DIRECT on file %s: %s: %s, " + "continuing anyway.", + file_name, operation_name, strerror(errno_save)); + } + } +#endif /* defined(UNIV_SOLARIS) && defined(DIRECTIO_ON) */ +} + + +/****************************************************************//** +Tries to enable the atomic write feature, if available, for the specified file +handle. +@return TRUE if success */ +static __attribute__((warn_unused_result)) +ibool +os_file_set_atomic_writes( +/*======================*/ + const char* name /*!< in: name of the file */ + __attribute__((unused)), + os_file_t file /*!< in: handle to the file */ + __attribute__((unused))) + +{ +#ifdef DFS_IOCTL_ATOMIC_WRITE_SET + int atomic_option = 1; + + if (ioctl(file, DFS_IOCTL_ATOMIC_WRITE_SET, &atomic_option)) { + + fprintf(stderr, "InnoDB: Warning:Trying to enable atomic writes on " + "file %s on non-supported platform!\n", name); + os_file_handle_error_no_exit(name, "ioctl(DFS_IOCTL_ATOMIC_WRITE_SET)", FALSE, __FILE__, __LINE__); + return(FALSE); + } + + return(TRUE); +#else + fprintf(stderr, "InnoDB: Error: trying to enable atomic writes on " + "file %s on non-supported platform!\n", name); + return(FALSE); +#endif +} + +/****************************************************************//** +NOTE! Use the corresponding macro os_file_create(), not directly +this function! +Opens an existing file or creates a new. 
+@return own: handle to the file, not defined if error, error number +can be retrieved with os_file_get_last_error */ +UNIV_INTERN +os_file_t +os_file_create_func( +/*================*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + ulint create_mode,/*!< in: create mode */ + ulint purpose,/*!< in: OS_FILE_AIO, if asynchronous, + non-buffered i/o is desired, + OS_FILE_NORMAL, if any normal file; + NOTE that it also depends on type, os_aio_.. + and srv_.. variables whether we really use + async i/o or unbuffered i/o: look in the + function source code for the exact rules */ + ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ulint atomic_writes) /*! in: atomic writes table option + value */ +{ + os_file_t file; + ibool retry; + ibool on_error_no_exit; + ibool on_error_silent; + atomic_writes_t awrites = (atomic_writes_t) atomic_writes; + +#ifdef __WIN__ + DBUG_EXECUTE_IF( + "ib_create_table_fail_disk_full", + *success = FALSE; + SetLastError(ERROR_DISK_FULL); + return((os_file_t) -1); + ); +#else /* __WIN__ */ + DBUG_EXECUTE_IF( + "ib_create_table_fail_disk_full", + *success = FALSE; + errno = ENOSPC; + return((os_file_t) -1); + ); +#endif /* __WIN__ */ + +#ifdef __WIN__ + DWORD create_flag; + DWORD share_mode = FILE_SHARE_READ; + + on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT + ? TRUE : FALSE; + + on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT + ? TRUE : FALSE; + + create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT; + create_mode &= ~OS_FILE_ON_ERROR_SILENT; + + if (create_mode == OS_FILE_OPEN_RAW) { + + ut_a(!srv_read_only_mode); + + create_flag = OPEN_EXISTING; + + /* On Windows Physical devices require admin privileges and + have to have the write-share mode set. See the remarks + section for the CreateFile() function documentation in MSDN. 
*/ + + share_mode |= FILE_SHARE_WRITE; + + } else if (create_mode == OS_FILE_OPEN + || create_mode == OS_FILE_OPEN_RETRY) { + + create_flag = OPEN_EXISTING; + + } else if (srv_read_only_mode) { + + create_flag = OPEN_EXISTING; + + } else if (create_mode == OS_FILE_CREATE) { + + create_flag = CREATE_NEW; + + } else if (create_mode == OS_FILE_OVERWRITE) { + + create_flag = CREATE_ALWAYS; + + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file create mode (%lu) for file '%s'", + create_mode, name); + + return((os_file_t) -1); + } + + DWORD attributes = 0; + +#ifdef UNIV_HOTBACKUP + attributes |= FILE_FLAG_NO_BUFFERING; +#else + if (purpose == OS_FILE_AIO) { + +#ifdef WIN_ASYNC_IO + /* If specified, use asynchronous (overlapped) io and no + buffering of writes in the OS */ + + if (srv_use_native_aio) { + attributes |= FILE_FLAG_OVERLAPPED; + } +#endif /* WIN_ASYNC_IO */ + + } else if (purpose == OS_FILE_NORMAL) { + /* Use default setting. */ + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown purpose flag (%lu) while opening file '%s'", + purpose, name); + + return((os_file_t)(-1)); + } + +#ifdef UNIV_NON_BUFFERED_IO + // TODO: Create a bug, this looks wrong. The flush log + // parameter is dynamic. 
+ if (type == OS_LOG_FILE && thd_flush_log_at_trx_commit(NULL) == 2) { + + /* Do not use unbuffered i/o for the log files because + value 2 denotes that we do not flush the log at every + commit, but only once per second */ + + } else if (srv_win_file_flush_method == SRV_WIN_IO_UNBUFFERED) { + + attributes |= FILE_FLAG_NO_BUFFERING; + } +#endif /* UNIV_NON_BUFFERED_IO */ + +#endif /* UNIV_HOTBACKUP */ + DWORD access = GENERIC_READ; + + if (!srv_read_only_mode) { + access |= GENERIC_WRITE; + } + + if (type == OS_LOG_FILE) { + if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) { + /* Map O_DSYNC to WRITE_THROUGH */ + attributes |= FILE_FLAG_WRITE_THROUGH; + } else if (srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) { + /* Open log file without buffering */ + attributes |= FILE_FLAG_NO_BUFFERING; + } + } + + do { + /* Use default security attributes and no template file. */ + file = CreateFile( + (LPCTSTR) name, access, share_mode, NULL, + create_flag, attributes, NULL); + + if (file == INVALID_HANDLE_VALUE) { + const char* operation; + + operation = (create_mode == OS_FILE_CREATE + && !srv_read_only_mode) + ? "create" : "open"; + + *success = FALSE; + + if (on_error_no_exit) { + retry = os_file_handle_error_no_exit( + name, operation, on_error_silent, __FILE__, __LINE__); + } else { + retry = os_file_handle_error(name, operation, __FILE__, __LINE__); + } + } else { + *success = TRUE; + retry = FALSE; + if (srv_use_native_aio && ((attributes & FILE_FLAG_OVERLAPPED) != 0)) { + ut_a(CreateIoCompletionPort(file, completion_port, 0, 0)); + } + } + + } while (retry); + + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. 
If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ + if (file != INVALID_HANDLE_VALUE + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + CloseHandle(file); + os_file_delete_if_exists_func(name); + *success = FALSE; + file = INVALID_HANDLE_VALUE; + } + } + +#else /* __WIN__ */ + int create_flag; + const char* mode_str = NULL; + if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) + WAIT_ALLOW_WRITES(); + + on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT + ? TRUE : FALSE; + on_error_silent = create_mode & OS_FILE_ON_ERROR_SILENT + ? TRUE : FALSE; + + create_mode &= ~OS_FILE_ON_ERROR_NO_EXIT; + create_mode &= ~OS_FILE_ON_ERROR_SILENT; + + if (create_mode == OS_FILE_OPEN + || create_mode == OS_FILE_OPEN_RAW + || create_mode == OS_FILE_OPEN_RETRY) { + + mode_str = "OPEN"; + + create_flag = srv_read_only_mode ? 
O_RDONLY : O_RDWR; + + } else if (srv_read_only_mode) { + + mode_str = "OPEN"; + + create_flag = O_RDONLY; + + } else if (create_mode == OS_FILE_CREATE) { + + mode_str = "CREATE"; + create_flag = O_RDWR | O_CREAT | O_EXCL; + + } else if (create_mode == OS_FILE_OVERWRITE) { + + mode_str = "OVERWRITE"; + create_flag = O_RDWR | O_CREAT | O_TRUNC; + + } else { + ib_logf(IB_LOG_LEVEL_ERROR, + "Unknown file create mode (%lu) for file '%s'", + create_mode, name); + + return((os_file_t) -1); + } + + ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE); + ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL); + +#ifdef O_SYNC + /* We let O_SYNC only affect log files; note that we map O_DSYNC to + O_SYNC because the datasync options seemed to corrupt files in 2001 + in both Linux and Solaris */ + + if (!srv_read_only_mode + && type == OS_LOG_FILE + && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) { + + create_flag |= O_SYNC; + } +#endif /* O_SYNC */ + + do { + file = ::open(name, create_flag, os_innodb_umask); + + if (file == -1) { + const char* operation; + + operation = (create_mode == OS_FILE_CREATE + && !srv_read_only_mode) + ? 
"create" : "open"; + + *success = FALSE; + + if (on_error_no_exit) { + retry = os_file_handle_error_no_exit( + name, operation, on_error_silent, __FILE__, __LINE__); + } else { + retry = os_file_handle_error(name, operation, __FILE__, __LINE__); + } + } else { + *success = TRUE; + retry = false; + } + + } while (retry); + + if (!srv_read_only_mode + && *success + && type != OS_LOG_FILE + && (srv_unix_file_flush_method == SRV_UNIX_O_DIRECT + || srv_unix_file_flush_method == SRV_UNIX_O_DIRECT_NO_FSYNC)) { + + os_file_set_nocache(file, name, mode_str); + } else if (!srv_read_only_mode + && *success + && srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) { + os_file_set_nocache(file, name, mode_str); + } + +#ifdef USE_FILE_LOCK + if (!srv_read_only_mode + && *success + && create_mode != OS_FILE_OPEN_RAW + && os_file_lock(file, name)) { + + if (create_mode == OS_FILE_OPEN_RETRY) { + + ut_a(!srv_read_only_mode); + + ib_logf(IB_LOG_LEVEL_INFO, + "Retrying to lock the first data file"); + + for (int i = 0; i < 100; i++) { + os_thread_sleep(1000000); + + if (!os_file_lock(file, name)) { + *success = TRUE; + return(file); + } + } + + ib_logf(IB_LOG_LEVEL_INFO, + "Unable to open the first data file"); + } + + *success = FALSE; + close(file); + file = -1; + } +#endif /* USE_FILE_LOCK */ + + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. 
If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ + if (file != -1 + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + close(file); + os_file_delete_if_exists_func(name); + *success = FALSE; + file = -1; + } + } + + +#endif /* __WIN__ */ + + return(file); +} + +/***********************************************************************//** +Deletes a file if it exists. The file has to be closed before calling this. +@return TRUE if success */ +UNIV_INTERN +bool +os_file_delete_if_exists_func( +/*==========================*/ + const char* name) /*!< in: file path as a null-terminated + string */ +{ +#ifdef __WIN__ + bool ret; + ulint count = 0; +loop: + /* In Windows, deleting an .ibd file may fail if ibbackup is copying + it */ + + ret = DeleteFile((LPCTSTR) name); + + if (ret) { + return(true); + } + + DWORD lasterr = GetLastError(); + if (lasterr == ERROR_FILE_NOT_FOUND + || lasterr == ERROR_PATH_NOT_FOUND) { + /* the file does not exist, this not an error */ + + return(true); + } + + count++; + + if (count > 100 && 0 == (count % 10)) { + os_file_get_last_error(true); /* print error information */ + + ib_logf(IB_LOG_LEVEL_WARN, "Delete of file %s failed.", name); + } + + os_thread_sleep(1000000); /* sleep for a second */ + + if (count > 2000) { + + return(false); + } + + goto loop; +#else + int ret; + WAIT_ALLOW_WRITES(); + + ret = unlink(name); + + if (ret != 0 && errno != ENOENT) { + os_file_handle_error_no_exit(name, "delete", FALSE, __FILE__, __LINE__); + + return(false); + } + + return(true); +#endif /* __WIN__ */ +} + +/***********************************************************************//** +Deletes a file. The file has to be closed before calling this. 
+@return TRUE if success */ +UNIV_INTERN +bool +os_file_delete_func( +/*================*/ + const char* name) /*!< in: file path as a null-terminated + string */ +{ +#ifdef __WIN__ + BOOL ret; + ulint count = 0; +loop: + /* In Windows, deleting an .ibd file may fail if ibbackup is copying + it */ + + ret = DeleteFile((LPCTSTR) name); + + if (ret) { + return(true); + } + + if (GetLastError() == ERROR_FILE_NOT_FOUND) { + /* If the file does not exist, we classify this as a 'mild' + error and return */ + + return(false); + } + + count++; + + if (count > 100 && 0 == (count % 10)) { + os_file_get_last_error(true); /* print error information */ + + fprintf(stderr, + "InnoDB: Warning: cannot delete file %s\n" + "InnoDB: Are you running ibbackup" + " to back up the file?\n", name); + } + + os_thread_sleep(1000000); /* sleep for a second */ + + if (count > 2000) { + + return(false); + } + + goto loop; +#else + int ret; + WAIT_ALLOW_WRITES(); + + ret = unlink(name); + + if (ret != 0) { + os_file_handle_error_no_exit(name, "delete", FALSE, __FILE__, __LINE__); + + return(false); + } + + return(true); +#endif +} + +/***********************************************************************//** +NOTE! Use the corresponding macro os_file_rename(), not directly this function! +Renames a file (can also move it to another directory). It is safest that the +file is closed before calling this function. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_rename_func( +/*================*/ + const char* oldpath,/*!< in: old file path as a null-terminated + string */ + const char* newpath)/*!< in: new file path */ +{ +#ifdef UNIV_DEBUG + os_file_type_t type; + ibool exists; + + /* New path must not exist. */ + ut_ad(os_file_status(newpath, &exists, &type)); + ut_ad(!exists); + + /* Old path must exist. 
*/ + ut_ad(os_file_status(oldpath, &exists, &type)); + ut_ad(exists); +#endif /* UNIV_DEBUG */ + +#ifdef __WIN__ + BOOL ret; + + ret = MoveFileEx((LPCTSTR)oldpath, (LPCTSTR)newpath, MOVEFILE_REPLACE_EXISTING); + + if (ret) { + return(TRUE); + } + + os_file_handle_error_no_exit(oldpath, "rename", FALSE, __FILE__, __LINE__); + + return(FALSE); +#else + int ret; + WAIT_ALLOW_WRITES(); + + ret = rename(oldpath, newpath); + + if (ret != 0) { + os_file_handle_error_no_exit(oldpath, "rename", FALSE, __FILE__, __LINE__); + + return(FALSE); + } + + return(TRUE); +#endif /* __WIN__ */ +} + +/***********************************************************************//** +NOTE! Use the corresponding macro os_file_close(), not directly this function! +Closes a file handle. In case of error, error number can be retrieved with +os_file_get_last_error. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_close_func( +/*===============*/ + os_file_t file) /*!< in, own: handle to a file */ +{ +#ifdef __WIN__ + BOOL ret; + + ut_a(file); + + ret = CloseHandle(file); + + if (ret) { + return(TRUE); + } + + os_file_handle_error(NULL, "close", __FILE__, __LINE__); + + return(FALSE); +#else + int ret; + + ret = close(file); + + if (ret == -1) { + os_file_handle_error(NULL, "close", __FILE__, __LINE__); + + return(FALSE); + } + + return(TRUE); +#endif /* __WIN__ */ +} + +/***********************************************************************//** +Closes a file handle. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_close_no_error_handling( +/*============================*/ + os_file_t file) /*!< in, own: handle to a file */ +{ +#ifdef __WIN__ + BOOL ret; + + ut_a(file); + + ret = CloseHandle(file); + + if (ret) { + return(TRUE); + } + + return(FALSE); +#else + int ret; + + ret = close(file); + + if (ret == -1) { + + return(FALSE); + } + + return(TRUE); +#endif /* __WIN__ */ +} + +/***********************************************************************//** +Gets a file size. 
+@return file size, or (os_offset_t) -1 on failure */ +UNIV_INTERN +os_offset_t +os_file_get_size( +/*=============*/ + os_file_t file) /*!< in: handle to a file */ +{ +#ifdef __WIN__ + os_offset_t offset; + DWORD high; + DWORD low; + + low = GetFileSize(file, &high); + + if ((low == 0xFFFFFFFF) && (GetLastError() != NO_ERROR)) { + return((os_offset_t) -1); + } + + offset = (os_offset_t) low | ((os_offset_t) high << 32); + + return(offset); +#else + return((os_offset_t) lseek(file, 0, SEEK_END)); +#endif /* __WIN__ */ +} + +/***********************************************************************//** +Write the specified number of zeros to a newly created file. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_set_size( +/*=============*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + os_file_t file, /*!< in: handle to a file */ + os_offset_t size) /*!< in: file size */ +{ + os_offset_t current_size; + ibool ret; + byte* buf; + byte* buf2; + ulint buf_size; + + current_size = 0; + +#ifdef HAVE_POSIX_FALLOCATE + if (srv_use_posix_fallocate) { + + if (posix_fallocate(file, current_size, size) == -1) { + + ib_logf(IB_LOG_LEVEL_ERROR, "preallocating file " + "space for file \'%s\' failed. Current size " + INT64PF ", desired size " INT64PF "\n", + name, current_size, size); + os_file_handle_error_no_exit (name, "posix_fallocate", + FALSE, __FILE__, __LINE__); + return(FALSE); + } + return(TRUE); + } +#endif + + /* Write up to 1 megabyte at a time. 
*/ + buf_size = ut_min(64, (ulint) (size / UNIV_PAGE_SIZE)) + * UNIV_PAGE_SIZE; + buf2 = static_cast(ut_malloc(buf_size + UNIV_PAGE_SIZE)); + + /* Align the buffer for possible raw i/o */ + buf = static_cast(ut_align(buf2, UNIV_PAGE_SIZE)); + + /* Write buffer full of zeros */ + memset(buf, 0, buf_size); + + if (size >= (os_offset_t) 100 << 20) { + + fprintf(stderr, "InnoDB: Progress in MB:"); + } + + while (current_size < size) { + ulint n_bytes; + + if (size - current_size < (os_offset_t) buf_size) { + n_bytes = (ulint) (size - current_size); + } else { + n_bytes = buf_size; + } + + ret = os_file_write(name, file, buf, current_size, n_bytes); + + if (!ret) { + ut_free(buf2); + goto error_handling; + } + + /* Print about progress for each 100 MB written */ + if ((current_size + n_bytes) / (100 << 20) + != current_size / (100 << 20)) { + + fprintf(stderr, " %lu00", + (ulong) ((current_size + n_bytes) + / (100 << 20))); + } + + current_size += n_bytes; + } + + if (size >= (os_offset_t) 100 << 20) { + + fprintf(stderr, "\n"); + } + + ut_free(buf2); + + ret = os_file_flush(file); + + if (ret) { + return(TRUE); + } + +error_handling: + return(FALSE); +} + +/***********************************************************************//** +Truncates a file at its current position. +@return TRUE if success */ +UNIV_INTERN +ibool +os_file_set_eof( +/*============*/ + FILE* file) /*!< in: file to be truncated */ +{ +#ifdef __WIN__ + HANDLE h = (HANDLE) _get_osfhandle(fileno(file)); + return(SetEndOfFile(h)); +#else /* __WIN__ */ + WAIT_ALLOW_WRITES(); + return(!ftruncate(fileno(file), ftell(file))); +#endif /* __WIN__ */ +} + +/***********************************************************************//** +Truncates a file at the specified position. 
+@return TRUE if success */ +UNIV_INTERN +ibool +os_file_set_eof_at( + os_file_t file, /*!< in: handle to a file */ + ib_uint64_t new_len)/*!< in: new file length */ +{ +#ifdef __WIN__ + LARGE_INTEGER li, li2; + li.QuadPart = new_len; + return(SetFilePointerEx(file, li, &li2,FILE_BEGIN) + && SetEndOfFile(file)); +#else + WAIT_ALLOW_WRITES(); + /* TODO: works only with -D_FILE_OFFSET_BITS=64 ? */ + return(!ftruncate(file, new_len)); +#endif +} + + +#ifndef __WIN__ +/***********************************************************************//** +Wrapper to fsync(2) that retries the call on some errors. +Returns the value 0 if successful; otherwise the value -1 is returned and +the global variable errno is set to indicate the error. +@return 0 if success, -1 otherwise */ + +static +int +os_file_fsync( +/*==========*/ + os_file_t file) /*!< in: handle to a file */ +{ + int ret; + int failures; + ibool retry; + + failures = 0; + + do { + ret = fsync(file); + + os_n_fsyncs++; + + if (ret == -1 && errno == ENOLCK) { + + if (failures % 100 == 0) { + + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: fsync(): " + "No locks available; retrying\n"); + } + + os_thread_sleep(200000 /* 0.2 sec */); + + failures++; + + retry = TRUE; + } else if (ret == -1 && errno == EINTR) { + /* Handle signal interruptions correctly */ + retry = TRUE; + } else { + + retry = FALSE; + } + } while (retry); + + return(ret); +} +#endif /* !__WIN__ */ + +/***********************************************************************//** +NOTE! Use the corresponding macro os_file_flush(), not directly this function! +Flushes the write buffers of a given file to the disk. 
+@return TRUE if success */ +UNIV_INTERN +ibool +os_file_flush_func( +/*===============*/ + os_file_t file) /*!< in, own: handle to a file */ +{ +#ifdef __WIN__ + BOOL ret; + + ut_a(file); + + os_n_fsyncs++; + + ret = FlushFileBuffers(file); + + if (ret) { + return(TRUE); + } + + /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is + actually a raw device, we choose to ignore that error if we are using + raw disks */ + + if (srv_start_raw_disk_in_use && GetLastError() + == ERROR_INVALID_FUNCTION) { + return(TRUE); + } + + os_file_handle_error(NULL, "flush", __FILE__, __LINE__); + + /* It is a fatal error if a file flush does not succeed, because then + the database can get corrupt on disk */ + ut_error; + + return(FALSE); +#else + int ret; + WAIT_ALLOW_WRITES(); + +#if defined(HAVE_DARWIN_THREADS) +# ifndef F_FULLFSYNC + /* The following definition is from the Mac OS X 10.3 */ +# define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */ +# elif F_FULLFSYNC != 51 +# error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3" +# endif + /* Apple has disabled fsync() for internal disk drives in OS X. That + caused corruption for a user when he tested a power outage. Let us in + OS X use a nonstandard flush method recommended by an Apple + engineer. */ + + if (!srv_have_fullfsync) { + /* If we are not on an operating system that supports this, + then fall back to a plain fsync. */ + + ret = os_file_fsync(file); + } else { + ret = fcntl(file, F_FULLFSYNC, NULL); + + if (ret) { + /* If we are not on a file system that supports this, + then fall back to a plain fsync. 
*/ + ret = os_file_fsync(file); + } + } +#else + ret = os_file_fsync(file); +#endif + + if (ret == 0) { + return(TRUE); + } + + /* Since Linux returns EINVAL if the 'file' is actually a raw device, + we choose to ignore that error if we are using raw disks */ + + if (srv_start_raw_disk_in_use && errno == EINVAL) { + + return(TRUE); + } + + ib_logf(IB_LOG_LEVEL_ERROR, "The OS said file flush did not succeed"); + + os_file_handle_error(NULL, "flush", __FILE__, __LINE__); + + /* It is a fatal error if a file flush does not succeed, because then + the database can get corrupt on disk */ + ut_error; + + return(FALSE); +#endif +} + +#ifndef __WIN__ +/*******************************************************************//** +Does a synchronous read operation in Posix. +@return number of bytes read, -1 if error */ +static __attribute__((nonnull(2), warn_unused_result)) +ssize_t +os_file_pread( +/*==========*/ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read */ + ulint n, /*!< in: number of bytes to read */ + os_offset_t offset, /*!< in: file offset from where to read */ + trx_t* trx) +{ + off_t offs; +#if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD) + ssize_t n_bytes; + ssize_t n_read; +#endif /* HAVE_PREAD && !HAVE_BROKEN_PREAD */ + ulint sec; + ulint ms; + ib_uint64_t start_time; + ib_uint64_t finish_time; + + ut_ad(n); + + /* If off_t is > 4 bytes in size, then we assume we can pass a + 64-bit address */ + offs = (off_t) offset; + + if (sizeof(off_t) <= 4) { + if (offset != (os_offset_t) offs) { + ib_logf(IB_LOG_LEVEL_ERROR, + "File read at offset > 4 GB"); + } + } + + os_n_file_reads++; + + if (UNIV_UNLIKELY(trx && trx->take_stats)) + { + trx->io_reads++; + trx->io_read += n; + ut_usectime(&sec, &ms); + start_time = (ib_uint64_t)sec * 1000000 + ms; + } else { + start_time = 0; + } +#if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD) +#if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8 + (void) 
os_atomic_increment_ulint(&os_n_pending_reads, 1); + (void) os_atomic_increment_ulint(&os_file_n_pending_preads, 1); + MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS); +#else + os_mutex_enter(os_file_count_mutex); + os_file_n_pending_preads++; + os_n_pending_reads++; + MONITOR_INC(MONITOR_OS_PENDING_READS); + os_mutex_exit(os_file_count_mutex); +#endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD == 8 */ + + /* Handle partial reads and signal interruptions correctly */ + for (n_bytes = 0; n_bytes < (ssize_t) n; ) { + n_read = pread(file, buf, (ssize_t)n - n_bytes, offs); + if (n_read > 0) { + n_bytes += n_read; + offs += n_read; + buf = (char *)buf + n_read; + } else if (n_read == -1 && errno == EINTR) { + continue; + } else { + break; + } + } + +#if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8 + (void) os_atomic_decrement_ulint(&os_n_pending_reads, 1); + (void) os_atomic_decrement_ulint(&os_file_n_pending_preads, 1); + MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_READS); +#else + os_mutex_enter(os_file_count_mutex); + os_file_n_pending_preads--; + os_n_pending_reads--; + MONITOR_DEC(MONITOR_OS_PENDING_READS); + os_mutex_exit(os_file_count_mutex); +#endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD == 8 */ + + if (UNIV_UNLIKELY(start_time != 0)) + { + ut_usectime(&sec, &ms); + finish_time = (ib_uint64_t)sec * 1000000 + ms; + trx->io_reads_wait_timer += (ulint)(finish_time - start_time); + } + + return(n_bytes); +#else + { + off_t ret_offset; + ssize_t ret; + ssize_t n_read; +#ifndef UNIV_HOTBACKUP + ulint i; +#endif /* !UNIV_HOTBACKUP */ + +#if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8 + (void) os_atomic_increment_ulint(&os_n_pending_reads, 1); + MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_READS); +#else + os_mutex_enter(os_file_count_mutex); + os_n_pending_reads++; + MONITOR_INC(MONITOR_OS_PENDING_READS); + os_mutex_exit(os_file_count_mutex); +#endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD == 8 */ +#ifndef UNIV_HOTBACKUP + /* Protect the seek / read operation with a mutex */ + 
i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES; + + os_mutex_enter(os_file_seek_mutexes[i]); +#endif /* !UNIV_HOTBACKUP */ + + ret_offset = lseek(file, offs, SEEK_SET); + + if (ret_offset < 0) { + ret = -1; + } else { + /* Handle signal interruptions correctly */ + for (ret = 0; ret < (ssize_t) n; ) { + n_read = read(file, buf, (ssize_t)n); + if (n_read > 0) { + ret += n_read; + } else if (n_read == -1 && errno == EINTR) { + continue; + } else { + break; + } + } + } + +#ifndef UNIV_HOTBACKUP + os_mutex_exit(os_file_seek_mutexes[i]); +#endif /* !UNIV_HOTBACKUP */ + +#if defined(HAVE_ATOMIC_BUILTINS) && UNIV_WORD_SIZE == 8 + (void) os_atomic_decrement_ulint(&os_n_pending_reads, 1); + MONITOR_ATOIC_DEC(MONITOR_OS_PENDING_READS); +#else + os_mutex_enter(os_file_count_mutex); + os_n_pending_reads--; + MONITOR_DEC(MONITOR_OS_PENDING_READS); + os_mutex_exit(os_file_count_mutex); +#endif /* HAVE_ATOMIC_BUILTINS && UNIV_WORD_SIZE == 8 */ + + if (UNIV_UNLIKELY(start_time != 0) + { + ut_usectime(&sec, &ms); + finish_time = (ib_uint64_t)sec * 1000000 + ms; + trx->io_reads_wait_timer += (ulint)(finish_time - start_time); + } + + return(ret); + } +#endif +} + +/*******************************************************************//** +Does a synchronous write operation in Posix. 
+@return number of bytes written, -1 if error */ +static __attribute__((nonnull, warn_unused_result)) +ssize_t +os_file_pwrite( +/*===========*/ + os_file_t file, /*!< in: handle to a file */ + const void* buf, /*!< in: buffer from where to write */ + ulint n, /*!< in: number of bytes to write */ + os_offset_t offset) /*!< in: file offset where to write */ +{ + ssize_t ret; + ssize_t n_written; + off_t offs; + + ut_ad(n); + ut_ad(!srv_read_only_mode); + + /* If off_t is > 4 bytes in size, then we assume we can pass a + 64-bit address */ + offs = (off_t) offset; + + if (sizeof(off_t) <= 4) { + if (offset != (os_offset_t) offs) { + ib_logf(IB_LOG_LEVEL_ERROR, + "File write at offset > 4 GB."); + } + } + + os_n_file_writes++; + +#if defined(HAVE_PWRITE) && !defined(HAVE_BROKEN_PREAD) +#if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8 + os_mutex_enter(os_file_count_mutex); + os_file_n_pending_pwrites++; + os_n_pending_writes++; + MONITOR_INC(MONITOR_OS_PENDING_WRITES); + os_mutex_exit(os_file_count_mutex); +#else + (void) os_atomic_increment_ulint(&os_n_pending_writes, 1); + (void) os_atomic_increment_ulint(&os_file_n_pending_pwrites, 1); + MONITOR_ATOMIC_INC(MONITOR_OS_PENDING_WRITES); +#endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD < 8 */ + + /* Handle partial writes and signal interruptions correctly */ + for (ret = 0; ret < (ssize_t) n; ) { + n_written = pwrite(file, buf, (ssize_t)n - ret, offs); + if (n_written >= 0) { + ret += n_written; + offs += n_written; + buf = (char *)buf + n_written; + } else if (n_written == -1 && errno == EINTR) { + continue; + } else { + break; + } + } + +#if !defined(HAVE_ATOMIC_BUILTINS) || UNIV_WORD_SIZE < 8 + os_mutex_enter(os_file_count_mutex); + os_file_n_pending_pwrites--; + os_n_pending_writes--; + MONITOR_DEC(MONITOR_OS_PENDING_WRITES); + os_mutex_exit(os_file_count_mutex); +#else + (void) os_atomic_decrement_ulint(&os_n_pending_writes, 1); + (void) os_atomic_decrement_ulint(&os_file_n_pending_pwrites, 1); + 
MONITOR_ATOMIC_DEC(MONITOR_OS_PENDING_WRITES); +#endif /* !HAVE_ATOMIC_BUILTINS || UNIV_WORD < 8 */ + + return(ret); +#else + { + off_t ret_offset; +# ifndef UNIV_HOTBACKUP + ulint i; +# endif /* !UNIV_HOTBACKUP */ + + os_mutex_enter(os_file_count_mutex); + os_n_pending_writes++; + MONITOR_INC(MONITOR_OS_PENDING_WRITES); + os_mutex_exit(os_file_count_mutex); + +# ifndef UNIV_HOTBACKUP + /* Protect the seek / write operation with a mutex */ + i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES; + + os_mutex_enter(os_file_seek_mutexes[i]); +# endif /* UNIV_HOTBACKUP */ + + ret_offset = lseek(file, offs, SEEK_SET); + + if (ret_offset < 0) { + ret = -1; + + goto func_exit; + } + + /* Handle signal interruptions correctly */ + for (ret = 0; ret < (ssize_t) n; ) { + n_written = write(file, buf, (ssize_t)n); + if (n_written > 0) { + ret += n_written; + } else if (n_written == -1 && errno == EINTR) { + continue; + } else { + break; + } + } + +func_exit: +# ifndef UNIV_HOTBACKUP + os_mutex_exit(os_file_seek_mutexes[i]); +# endif /* !UNIV_HOTBACKUP */ + + os_mutex_enter(os_file_count_mutex); + os_n_pending_writes--; + MONITOR_DEC(MONITOR_OS_PENDING_WRITES); + os_mutex_exit(os_file_count_mutex); + + return(ret); + } +#endif /* !UNIV_HOTBACKUP */ +} +#endif + +/*******************************************************************//** +NOTE! Use the corresponding macro os_file_read(), not directly this +function! +Requests a synchronous positioned read operation. +@return TRUE if request was successful, FALSE if fail */ +UNIV_INTERN +ibool +os_file_read_func( +/*==============*/ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read */ + os_offset_t offset, /*!< in: file offset where to read */ + ulint n, /*!< in: number of bytes to read */ + trx_t* trx, + ibool compressed) /*!< in: is this file space + compressed ? */ +{ +#ifdef __WIN__ + BOOL ret; + DWORD len; + ibool retry; + OVERLAPPED overlapped; + + + /* On 64-bit Windows, ulint is 64 bits. 
But offset and n should be + no more than 32 bits. */ + ut_a((n & 0xFFFFFFFFUL) == n); + + os_n_file_reads++; + os_bytes_read_since_printout += n; + +try_again: + ut_ad(file); + ut_ad(buf); + ut_ad(n > 0); + + os_mutex_enter(os_file_count_mutex); + os_n_pending_reads++; + MONITOR_INC(MONITOR_OS_PENDING_READS); + os_mutex_exit(os_file_count_mutex); + + memset (&overlapped, 0, sizeof (overlapped)); + overlapped.Offset = (DWORD)(offset & 0xFFFFFFFF); + overlapped.OffsetHigh = (DWORD)(offset >> 32); + overlapped.hEvent = win_get_syncio_event(); + ret = ReadFile(file, buf, n, NULL, &overlapped); + if (ret) { + ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, FALSE); + } + else if(GetLastError() == ERROR_IO_PENDING) { + ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, TRUE); + } + os_mutex_enter(os_file_count_mutex); + os_n_pending_reads--; + MONITOR_DEC(MONITOR_OS_PENDING_READS); + os_mutex_exit(os_file_count_mutex); + + if (ret && len == n) { + /* Note that InnoDB writes files that are not formated + as file spaces and they do not have FIL_PAGE_TYPE + field, thus we must use here information is the actual + file space compressed. */ + if (compressed && fil_page_is_compressed((byte *)buf)) { + fil_decompress_page(NULL, (byte *)buf, len, NULL); + } + + return(TRUE); + } +#else /* __WIN__ */ + ibool retry; + ssize_t ret; + + os_bytes_read_since_printout += n; + +try_again: + ret = os_file_pread(file, buf, n, offset, trx); + + if ((ulint) ret == n) { + if (fil_page_is_encrypted((byte *)buf)) { + fil_decrypt_page(NULL, (byte *)buf, n, NULL); + } + /* Note that InnoDB writes files that are not formated + as file spaces and they do not have FIL_PAGE_TYPE + field, thus we must use here information is the actual + file space compressed. 
*/ + if (compressed && fil_page_is_compressed((byte *)buf)) { + fil_decompress_page(NULL, (byte *)buf, n, NULL); + } + + + return(TRUE); + } + + ib_logf(IB_LOG_LEVEL_ERROR, + "Tried to read "ULINTPF" bytes at offset " UINT64PF". " + "Was only able to read %ld.", n, offset, (lint) ret); +#endif /* __WIN__ */ + retry = os_file_handle_error(NULL, "read", __FILE__, __LINE__); + + if (retry) { + goto try_again; + } + + fprintf(stderr, + "InnoDB: Fatal error: cannot read from file." + " OS error number %lu.\n", +#ifdef __WIN__ + (ulong) GetLastError() +#else + (ulong) errno +#endif /* __WIN__ */ + ); + fflush(stderr); + + ut_error; + + return(FALSE); +} + +/*******************************************************************//** +NOTE! Use the corresponding macro os_file_read_no_error_handling(), +not directly this function! +Requests a synchronous positioned read operation. This function does not do +any error handling. In case of error it returns FALSE. +@return TRUE if request was successful, FALSE if fail */ +UNIV_INTERN +ibool +os_file_read_no_error_handling_func( +/*================================*/ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read */ + os_offset_t offset, /*!< in: file offset where to read */ + ulint n, /*!< in: number of bytes to read */ + ibool compressed) /*!< in: is this file space + compressed ? */ +{ +#ifdef __WIN__ + BOOL ret; + DWORD len; + ibool retry; + OVERLAPPED overlapped; + overlapped.Offset = (DWORD)(offset & 0xFFFFFFFF); + overlapped.OffsetHigh = (DWORD)(offset >> 32); + + + /* On 64-bit Windows, ulint is 64 bits. But offset and n should be + no more than 32 bits. 
*/ + ut_a((n & 0xFFFFFFFFUL) == n); + + os_n_file_reads++; + os_bytes_read_since_printout += n; + +try_again: + ut_ad(file); + ut_ad(buf); + ut_ad(n > 0); + + os_mutex_enter(os_file_count_mutex); + os_n_pending_reads++; + MONITOR_INC(MONITOR_OS_PENDING_READS); + os_mutex_exit(os_file_count_mutex); + + memset (&overlapped, 0, sizeof (overlapped)); + overlapped.Offset = (DWORD)(offset & 0xFFFFFFFF); + overlapped.OffsetHigh = (DWORD)(offset >> 32); + overlapped.hEvent = win_get_syncio_event(); + ret = ReadFile(file, buf, n, NULL, &overlapped); + if (ret) { + ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, FALSE); + } + else if(GetLastError() == ERROR_IO_PENDING) { + ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, TRUE); + } + os_mutex_enter(os_file_count_mutex); + os_n_pending_reads--; + MONITOR_DEC(MONITOR_OS_PENDING_READS); + os_mutex_exit(os_file_count_mutex); + + if (ret && len == n) { + + /* Note that InnoDB writes files that are not formated + as file spaces and they do not have FIL_PAGE_TYPE + field, thus we must use here information is the actual + file space compressed. */ + if (compressed && fil_page_is_compressed((byte *)buf)) { + fil_decompress_page(NULL, (byte *)buf, n, NULL); + } + + return(TRUE); + } +#else /* __WIN__ */ + ibool retry; + ssize_t ret; + + os_bytes_read_since_printout += n; + +try_again: + ret = os_file_pread(file, buf, n, offset, NULL); + + if ((ulint) ret == n) { + + /* Note that InnoDB writes files that are not formated + as file spaces and they do not have FIL_PAGE_TYPE + field, thus we must use here information is the actual + file space compressed. 
*/ + if (compressed && fil_page_is_compressed((byte *)buf)) { + fil_decompress_page(NULL, (byte *)buf, n, NULL); + } + + if (fil_page_is_encrypted((byte *)buf)) { + fil_decrypt_page(NULL, (byte *)buf, n, NULL); + } + + + + return(TRUE); + } +#endif /* __WIN__ */ + retry = os_file_handle_error_no_exit(NULL, "read", FALSE, __FILE__, __LINE__); + + if (retry) { + goto try_again; + } + + return(FALSE); +} + +/*******************************************************************//** +Rewind file to its start, read at most size - 1 bytes from it to str, and +NUL-terminate str. All errors are silently ignored. This function is +mostly meant to be used with temporary files. */ +UNIV_INTERN +void +os_file_read_string( +/*================*/ + FILE* file, /*!< in: file to read from */ + char* str, /*!< in: buffer where to read */ + ulint size) /*!< in: size of buffer */ +{ + size_t flen; + + if (size == 0) { + return; + } + + rewind(file); + flen = fread(str, 1, size - 1, file); + str[flen] = '\0'; +} + +/*******************************************************************//** +NOTE! Use the corresponding macro os_file_write(), not directly +this function! +Requests a synchronous write operation. +@return TRUE if request was successful, FALSE if fail */ +UNIV_INTERN +ibool +os_file_write_func( +/*===============*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + os_file_t file, /*!< in: handle to a file */ + const void* buf, /*!< in: buffer from which to write */ + os_offset_t offset, /*!< in: file offset where to write */ + ulint n) /*!< in: number of bytes to write */ +{ + ut_ad(!srv_read_only_mode); + +#ifdef __WIN__ + BOOL ret; + DWORD len; + ulint n_retries = 0; + ulint err; + OVERLAPPED overlapped; + + /* On 64-bit Windows, ulint is 64 bits. But offset and n should be + no more than 32 bits. 
*/ + ut_a((n & 0xFFFFFFFFUL) == n); + + os_n_file_writes++; + + ut_ad(file); + ut_ad(buf); + ut_ad(n > 0); + +retry: + + os_mutex_enter(os_file_count_mutex); + os_n_pending_writes++; + MONITOR_INC(MONITOR_OS_PENDING_WRITES); + os_mutex_exit(os_file_count_mutex); + + memset (&overlapped, 0, sizeof (overlapped)); + overlapped.Offset = (DWORD)(offset & 0xFFFFFFFF); + overlapped.OffsetHigh = (DWORD)(offset >> 32); + + overlapped.hEvent = win_get_syncio_event(); + ret = WriteFile(file, buf, n, NULL, &overlapped); + if (ret) { + ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, FALSE); + } + else if(GetLastError() == ERROR_IO_PENDING) { + ret = GetOverlappedResult(file, &overlapped, (DWORD *)&len, TRUE); + } + + os_mutex_enter(os_file_count_mutex); + os_n_pending_writes--; + MONITOR_DEC(MONITOR_OS_PENDING_WRITES); + os_mutex_exit(os_file_count_mutex); + + if (ret && len == n) { + + return(TRUE); + } + + /* If some background file system backup tool is running, then, at + least in Windows 2000, we may get here a specific error. Let us + retry the operation 100 times, with 1 second waits. 
*/ + + if (GetLastError() == ERROR_LOCK_VIOLATION && n_retries < 100) { + + os_thread_sleep(1000000); + + n_retries++; + + goto retry; + } + + if (!os_has_said_disk_full) { + + err = (ulint) GetLastError(); + + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Error: Write to file %s failed" + " at offset %llu.\n" + "InnoDB: %lu bytes should have been written," + " only %lu were written.\n" + "InnoDB: Operating system error number %lu.\n" + "InnoDB: Check that your OS and file system" + " support files of this size.\n" + "InnoDB: Check also that the disk is not full" + " or a disk quota exceeded.\n", + name, offset, + (ulong) n, (ulong) len, (ulong) err); + + if (strerror((int) err) != NULL) { + fprintf(stderr, + "InnoDB: Error number %lu means '%s'.\n", + (ulong) err, strerror((int) err)); + } + + fprintf(stderr, + "InnoDB: Some operating system error numbers" + " are described at\n" + "InnoDB: " + REFMAN "operating-system-error-codes.html\n"); + + os_has_said_disk_full = TRUE; + } + + return(FALSE); +#else + ssize_t ret; + WAIT_ALLOW_WRITES(); + + ret = os_file_pwrite(file, buf, n, offset); + + if ((ulint) ret == n) { + + return(TRUE); + } + + if (!os_has_said_disk_full) { + + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Error: Write to file %s failed" + " at offset "UINT64PF".\n" + "InnoDB: %lu bytes should have been written," + " only %ld were written.\n" + "InnoDB: Operating system error number %lu.\n" + "InnoDB: Check that your OS and file system" + " support files of this size.\n" + "InnoDB: Check also that the disk is not full" + " or a disk quota exceeded.\n", + name, offset, n, (lint) ret, + (ulint) errno); + if (strerror(errno) != NULL) { + fprintf(stderr, + "InnoDB: Error number %d means '%s'.\n", + errno, strerror(errno)); + } + + fprintf(stderr, + "InnoDB: Some operating system error numbers" + " are described at\n" + "InnoDB: " + REFMAN "operating-system-error-codes.html\n"); + + os_has_said_disk_full = TRUE; + } + + 
return(FALSE); +#endif +} + +/*******************************************************************//** +Check the existence and type of the given file. +@return TRUE if call succeeded */ +UNIV_INTERN +ibool +os_file_status( +/*===========*/ + const char* path, /*!< in: pathname of the file */ + ibool* exists, /*!< out: TRUE if file exists */ + os_file_type_t* type) /*!< out: type of the file (if it exists) */ +{ +#ifdef __WIN__ + int ret; + struct _stat64 statinfo; + + ret = _stat64(path, &statinfo); + if (ret && (errno == ENOENT || errno == ENOTDIR)) { + /* file does not exist */ + *exists = FALSE; + return(TRUE); + } else if (ret) { + /* file exists, but stat call failed */ + + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); + + return(FALSE); + } + + if (_S_IFDIR & statinfo.st_mode) { + *type = OS_FILE_TYPE_DIR; + } else if (_S_IFREG & statinfo.st_mode) { + *type = OS_FILE_TYPE_FILE; + } else { + *type = OS_FILE_TYPE_UNKNOWN; + } + + *exists = TRUE; + + return(TRUE); +#else + int ret; + struct stat statinfo; + + ret = stat(path, &statinfo); + if (ret && (errno == ENOENT || errno == ENOTDIR)) { + /* file does not exist */ + *exists = FALSE; + return(TRUE); + } else if (ret) { + /* file exists, but stat call failed */ + + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); + + return(FALSE); + } + + if (S_ISDIR(statinfo.st_mode)) { + *type = OS_FILE_TYPE_DIR; + } else if (S_ISLNK(statinfo.st_mode)) { + *type = OS_FILE_TYPE_LINK; + } else if (S_ISREG(statinfo.st_mode)) { + *type = OS_FILE_TYPE_FILE; + } else { + *type = OS_FILE_TYPE_UNKNOWN; + } + + *exists = TRUE; + + return(TRUE); +#endif +} + +/*******************************************************************//** +This function returns information about the specified file +@return DB_SUCCESS if all OK */ +UNIV_INTERN +dberr_t +os_file_get_status( +/*===============*/ + const char* path, /*!< in: pathname of the file */ + os_file_stat_t* stat_info, /*!< information of a 
file in a + directory */ + bool check_rw_perm) /*!< in: for testing whether the + file can be opened in RW mode */ +{ + int ret; + +#ifdef __WIN__ + struct _stat64 statinfo; + + ret = _stat64(path, &statinfo); + + if (ret && (errno == ENOENT || errno == ENOTDIR)) { + /* file does not exist */ + + return(DB_NOT_FOUND); + + } else if (ret) { + /* file exists, but stat call failed */ + + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); + + return(DB_FAIL); + + } else if (_S_IFDIR & statinfo.st_mode) { + stat_info->type = OS_FILE_TYPE_DIR; + } else if (_S_IFREG & statinfo.st_mode) { + + DWORD access = GENERIC_READ; + + if (!srv_read_only_mode) { + access |= GENERIC_WRITE; + } + + stat_info->type = OS_FILE_TYPE_FILE; + + /* Check if we can open it in read-only mode. */ + + if (check_rw_perm) { + HANDLE fh; + + fh = CreateFile( + (LPCTSTR) path, // File to open + access, + 0, // No sharing + NULL, // Default security + OPEN_EXISTING, // Existing file only + FILE_ATTRIBUTE_NORMAL, // Normal file + NULL); // No attr. 
template + + if (fh == INVALID_HANDLE_VALUE) { + stat_info->rw_perm = false; + } else { + stat_info->rw_perm = true; + CloseHandle(fh); + } + } + } else { + stat_info->type = OS_FILE_TYPE_UNKNOWN; + } +#else + struct stat statinfo; + + ret = stat(path, &statinfo); + + if (ret && (errno == ENOENT || errno == ENOTDIR)) { + /* file does not exist */ + + return(DB_NOT_FOUND); + + } else if (ret) { + /* file exists, but stat call failed */ + + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); + + return(DB_FAIL); + + } + + switch (statinfo.st_mode & S_IFMT) { + case S_IFDIR: + stat_info->type = OS_FILE_TYPE_DIR; + break; + case S_IFLNK: + stat_info->type = OS_FILE_TYPE_LINK; + break; + case S_IFBLK: + stat_info->type = OS_FILE_TYPE_BLOCK; + break; + case S_IFREG: + stat_info->type = OS_FILE_TYPE_FILE; + break; + default: + stat_info->type = OS_FILE_TYPE_UNKNOWN; + } + + + if (check_rw_perm && (stat_info->type == OS_FILE_TYPE_FILE + || stat_info->type == OS_FILE_TYPE_BLOCK)) { + int fh; + int access; + + access = !srv_read_only_mode ? O_RDWR : O_RDONLY; + + fh = ::open(path, access, os_innodb_umask); + + if (fh == -1) { + stat_info->rw_perm = false; + } else { + stat_info->rw_perm = true; + close(fh); + } + } + +#endif /* _WIN_ */ + + stat_info->ctime = statinfo.st_ctime; + stat_info->atime = statinfo.st_atime; + stat_info->mtime = statinfo.st_mtime; + stat_info->size = statinfo.st_size; + + return(DB_SUCCESS); +} + +/* path name separator character */ +#ifdef __WIN__ +# define OS_FILE_PATH_SEPARATOR '\\' +#else +# define OS_FILE_PATH_SEPARATOR '/' +#endif + +/****************************************************************//** +This function returns a new path name after replacing the basename +in an old path with a new basename. The old_path is a full path +name including the extension. The tablename is in the normal +form "databasename/tablename". The new base name is found after +the forward slash. Both input strings are null terminated. 
+ +This function allocates memory to be returned. It is the callers +responsibility to free the return value after it is no longer needed. + +@return own: new full pathname */ +UNIV_INTERN +char* +os_file_make_new_pathname( +/*======================*/ + const char* old_path, /*!< in: pathname */ + const char* tablename) /*!< in: contains new base name */ +{ + ulint dir_len; + char* last_slash; + char* base_name; + char* new_path; + ulint new_path_len; + + /* Split the tablename into its database and table name components. + They are separated by a '/'. */ + last_slash = strrchr((char*) tablename, '/'); + base_name = last_slash ? last_slash + 1 : (char*) tablename; + + /* Find the offset of the last slash. We will strip off the + old basename.ibd which starts after that slash. */ + last_slash = strrchr((char*) old_path, OS_FILE_PATH_SEPARATOR); + dir_len = last_slash ? last_slash - old_path : strlen(old_path); + + /* allocate a new path and move the old directory path to it. */ + new_path_len = dir_len + strlen(base_name) + sizeof "/.ibd"; + new_path = static_cast(mem_alloc(new_path_len)); + memcpy(new_path, old_path, dir_len); + + ut_snprintf(new_path + dir_len, + new_path_len - dir_len, + "%c%s.ibd", + OS_FILE_PATH_SEPARATOR, + base_name); + + return(new_path); +} + +/****************************************************************//** +This function returns a remote path name by combining a data directory +path provided in a DATA DIRECTORY clause with the tablename which is +in the form 'database/tablename'. It strips the file basename (which +is the tablename) found after the last directory in the path provided. +The full filepath created will include the database name as a directory +under the path provided. The filename is the tablename with the '.ibd' +extension. All input and output strings are null-terminated. + +This function allocates memory to be returned. It is the callers +responsibility to free the return value after it is no longer needed. 
+ +@return own: A full pathname; data_dir_path/databasename/tablename.ibd */ +UNIV_INTERN +char* +os_file_make_remote_pathname( +/*=========================*/ + const char* data_dir_path, /*!< in: pathname */ + const char* tablename, /*!< in: tablename */ + const char* extention) /*!< in: file extention; ibd,cfg */ +{ + ulint data_dir_len; + char* last_slash; + char* new_path; + ulint new_path_len; + + ut_ad(extention && strlen(extention) == 3); + + /* Find the offset of the last slash. We will strip off the + old basename or tablename which starts after that slash. */ + last_slash = strrchr((char*) data_dir_path, OS_FILE_PATH_SEPARATOR); + data_dir_len = last_slash ? last_slash - data_dir_path : strlen(data_dir_path); + + /* allocate a new path and move the old directory path to it. */ + new_path_len = data_dir_len + strlen(tablename) + + sizeof "/." + strlen(extention); + new_path = static_cast(mem_alloc(new_path_len)); + memcpy(new_path, data_dir_path, data_dir_len); + ut_snprintf(new_path + data_dir_len, + new_path_len - data_dir_len, + "%c%s.%s", + OS_FILE_PATH_SEPARATOR, + tablename, + extention); + + srv_normalize_path_for_win(new_path); + + return(new_path); +} + +/****************************************************************//** +This function reduces a null-terminated full remote path name into +the path that is sent by MySQL for DATA DIRECTORY clause. It replaces +the 'databasename/tablename.ibd' found at the end of the path with just +'tablename'. + +Since the result is always smaller than the path sent in, no new memory +is allocated. The caller should allocate memory for the path sent in. +This function manipulates that path in place. + +If the path format is not as expected, just return. The result is used +to inform a SHOW CREATE TABLE command. 
*/
UNIV_INTERN
void
os_file_make_data_dir_path(
/*========================*/
	char*	data_dir_path)	/*!< in/out: full path/data_dir_path */
{
	/* Cut the string at the period that starts the extension. */
	char* const	dot = strrchr(data_dir_path, '.');

	if (dot == NULL) {
		return;
	}

	*dot = '\0';

	/* Cut again at the last separator; the tablename follows it. */
	char* const	name_slash = strrchr(data_dir_path,
					     OS_FILE_PATH_SEPARATOR);

	if (name_slash == NULL) {
		return;
	}

	*name_slash = '\0';

	const char*	tablename = name_slash + 1;

	/* The separator that is now last precedes the databasename. */
	char* const	db_slash = strrchr(data_dir_path,
					   OS_FILE_PATH_SEPARATOR);

	if (db_slash == NULL) {
		return;
	}

	/* Slide the tablename down over the databasename and terminate. */
	const ulint	tablename_len = ut_strlen(tablename);
	char* const	dest = db_slash + 1;

	ut_memmove(dest, tablename, tablename_len);

	dest[tablename_len] = '\0';
}

/****************************************************************//**
The function os_file_dirname returns a directory component of a
null-terminated pathname string. In the usual case, dirname returns
the string up to, but not including, the final '/', and basename
is the component following the final '/'. Trailing '/' characters
are not counted as part of the pathname.

If path does not contain a slash, dirname returns the string ".".

Concatenating the string returned by dirname, a "/", and the basename
yields a complete pathname.

The return value is a copy of the directory component of the pathname.
The copy is allocated from heap. It is the caller responsibility
to free it after it is no longer needed.

The following list of examples (taken from SUSv2) shows the strings
returned by dirname and basename for different paths:

       path	      dirname	     basename
       "/usr/lib"     "/usr"	     "lib"
       "/usr/"	      "/"	     "usr"
       "usr"	      "."	     "usr"
       "/"	      "/"	     "/"
       "."	      "."	     "."
       ".."	      "."	     ".."
+ +@return own: directory component of the pathname */ +UNIV_INTERN +char* +os_file_dirname( +/*============*/ + const char* path) /*!< in: pathname */ +{ + /* Find the offset of the last slash */ + const char* last_slash = strrchr(path, OS_FILE_PATH_SEPARATOR); + if (!last_slash) { + /* No slash in the path, return "." */ + + return(mem_strdup(".")); + } + + /* Ok, there is a slash */ + + if (last_slash == path) { + /* last slash is the first char of the path */ + + return(mem_strdup("/")); + } + + /* Non-trivial directory component */ + + return(mem_strdupl(path, last_slash - path)); +} + +/****************************************************************//** +Creates all missing subdirectories along the given path. +@return TRUE if call succeeded FALSE otherwise */ +UNIV_INTERN +ibool +os_file_create_subdirs_if_needed( +/*=============================*/ + const char* path) /*!< in: path name */ +{ + if (srv_read_only_mode) { + + ib_logf(IB_LOG_LEVEL_ERROR, + "read only mode set. Can't create subdirectories '%s'", + path); + + return(FALSE); + + } + + char* subdir = os_file_dirname(path); + + if (strlen(subdir) == 1 + && (*subdir == OS_FILE_PATH_SEPARATOR || *subdir == '.')) { + /* subdir is root or cwd, nothing to do */ + mem_free(subdir); + + return(TRUE); + } + + /* Test if subdir exists */ + os_file_type_t type; + ibool subdir_exists; + ibool success = os_file_status(subdir, &subdir_exists, &type); + + if (success && !subdir_exists) { + + /* subdir does not exist, create it */ + success = os_file_create_subdirs_if_needed(subdir); + + if (!success) { + mem_free(subdir); + + return(FALSE); + } + + success = os_file_create_directory(subdir, FALSE); + } + + mem_free(subdir); + + return(success); +} + +#ifndef UNIV_HOTBACKUP +/****************************************************************//** +Returns a pointer to the nth slot in the aio array. 
+@return pointer to slot */ +static +os_aio_slot_t* +os_aio_array_get_nth_slot( +/*======================*/ + os_aio_array_t* array, /*!< in: aio array */ + ulint index) /*!< in: index of the slot */ +{ + ut_a(index < array->n_slots); + + return(&array->slots[index]); +} + +#if defined(LINUX_NATIVE_AIO) +/******************************************************************//** +Creates an io_context for native linux AIO. +@return TRUE on success. */ +static +ibool +os_aio_linux_create_io_ctx( +/*=======================*/ + ulint max_events, /*!< in: number of events. */ + io_context_t* io_ctx) /*!< out: io_ctx to initialize. */ +{ + int ret; + ulint retries = 0; + +retry: + memset(io_ctx, 0x0, sizeof(*io_ctx)); + + /* Initialize the io_ctx. Tell it how many pending + IO requests this context will handle. */ + + ret = io_setup(max_events, io_ctx); + if (ret == 0) { +#if defined(UNIV_AIO_DEBUG) + fprintf(stderr, + "InnoDB: Linux native AIO:" + " initialized io_ctx for segment\n"); +#endif + /* Success. Return now. */ + return(TRUE); + } + + /* If we hit EAGAIN we'll make a few attempts before failing. */ + + switch (ret) { + case -EAGAIN: + if (retries == 0) { + /* First time around. */ + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Warning: io_setup() failed" + " with EAGAIN. Will make %d attempts" + " before giving up.\n", + OS_AIO_IO_SETUP_RETRY_ATTEMPTS); + } + + if (retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) { + ++retries; + fprintf(stderr, + "InnoDB: Warning: io_setup() attempt" + " %lu failed.\n", + retries); + os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP); + goto retry; + } + + /* Have tried enough. Better call it a day. */ + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: io_setup() failed" + " with EAGAIN after %d attempts.\n", + OS_AIO_IO_SETUP_RETRY_ATTEMPTS); + break; + + case -ENOSYS: + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: Linux Native AIO interface" + " is not supported on this platform. 
Please" + " check your OS documentation and install" + " appropriate binary of InnoDB.\n"); + + break; + + default: + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: Linux Native AIO setup" + " returned following error[%d]\n", -ret); + break; + } + + fprintf(stderr, + "InnoDB: You can disable Linux Native AIO by" + " setting innodb_use_native_aio = 0 in my.cnf\n"); + return(FALSE); +} + +/******************************************************************//** +Checks if the system supports native linux aio. On some kernel +versions where native aio is supported it won't work on tmpfs. In such +cases we can't use native aio as it is not possible to mix simulated +and native aio. +@return: TRUE if supported, FALSE otherwise. */ +static +ibool +os_aio_native_aio_supported(void) +/*=============================*/ +{ + int fd; + io_context_t io_ctx; + char name[1000]; + + if (!os_aio_linux_create_io_ctx(1, &io_ctx)) { + /* The platform does not support native aio. */ + return(FALSE); + } else if (!srv_read_only_mode) { + /* Now check if tmpdir supports native aio ops. */ + fd = innobase_mysql_tmpfile(); + + if (fd < 0) { + ib_logf(IB_LOG_LEVEL_WARN, + "Unable to create temp file to check " + "native AIO support."); + + return(FALSE); + } + } else { + + srv_normalize_path_for_win(srv_log_group_home_dir); + + ulint dirnamelen = strlen(srv_log_group_home_dir); + ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile"); + memcpy(name, srv_log_group_home_dir, dirnamelen); + + /* Add a path separator if needed. 
*/ + if (dirnamelen && name[dirnamelen - 1] != SRV_PATH_SEPARATOR) { + name[dirnamelen++] = SRV_PATH_SEPARATOR; + } + + strcpy(name + dirnamelen, "ib_logfile0"); + + fd = ::open(name, O_RDONLY); + + if (fd == -1) { + + ib_logf(IB_LOG_LEVEL_WARN, + "Unable to open \"%s\" to check " + "native AIO read support.", name); + + return(FALSE); + } + } + + struct io_event io_event; + + memset(&io_event, 0x0, sizeof(io_event)); + + byte* buf = static_cast(ut_malloc(UNIV_PAGE_SIZE * 2)); + byte* ptr = static_cast(ut_align(buf, UNIV_PAGE_SIZE)); + + struct iocb iocb; + + /* Suppress valgrind warning. */ + memset(buf, 0x00, UNIV_PAGE_SIZE * 2); + memset(&iocb, 0x0, sizeof(iocb)); + + struct iocb* p_iocb = &iocb; + + if (!srv_read_only_mode) { + io_prep_pwrite(p_iocb, fd, ptr, UNIV_PAGE_SIZE, 0); + } else { + ut_a(UNIV_PAGE_SIZE >= 512); + io_prep_pread(p_iocb, fd, ptr, 512, 0); + } + + int err = io_submit(io_ctx, 1, &p_iocb); + + if (err >= 1) { + /* Now collect the submitted IO request. */ + err = io_getevents(io_ctx, 1, 1, &io_event, NULL); + } + + ut_free(buf); + close(fd); + + switch (err) { + case 1: + return(TRUE); + + case -EINVAL: + case -ENOSYS: + ib_logf(IB_LOG_LEVEL_ERROR, + "Linux Native AIO not supported. You can either " + "move %s to a file system that supports native " + "AIO or you can set innodb_use_native_aio to " + "FALSE to avoid this message.", + srv_read_only_mode ? name : "tmpdir"); + + /* fall through. */ + default: + ib_logf(IB_LOG_LEVEL_ERROR, + "Linux Native AIO check on %s returned error[%d]", + srv_read_only_mode ? name : "tmpdir", -err); + } + + return(FALSE); +} +#endif /* LINUX_NATIVE_AIO */ + +/******************************************************************//** +Creates an aio wait array. Note that we return NULL in case of failure. +We don't care about freeing memory here because we assume that a +failure will result in server refusing to start up. 
+@return own: aio array, NULL on failure */ +static +os_aio_array_t* +os_aio_array_create( +/*================*/ + ulint n, /*!< in: maximum number of pending aio + operations allowed; n must be + divisible by n_segments */ + ulint n_segments) /*!< in: number of segments in the aio array */ +{ + os_aio_array_t* array; +#ifdef LINUX_NATIVE_AIO + struct io_event* io_event = NULL; +#endif + ut_a(n > 0); + ut_a(n_segments > 0); + + array = static_cast(ut_malloc(sizeof(*array))); + memset(array, 0x0, sizeof(*array)); + + array->mutex = os_mutex_create(); + array->not_full = os_event_create(); + array->is_empty = os_event_create(); + + os_event_set(array->is_empty); + + array->n_slots = n; + array->n_segments = n_segments; + + array->slots = static_cast( + ut_malloc(n * sizeof(*array->slots))); + + memset(array->slots, 0x0, n * sizeof(*array->slots)); + +#if defined(LINUX_NATIVE_AIO) + array->aio_ctx = NULL; + array->aio_events = NULL; + + /* If we are not using native aio interface then skip this + part of initialization. */ + if (!srv_use_native_aio) { + goto skip_native_aio; + } + + /* Initialize the io_context array. One io_context + per segment in the array. */ + + array->aio_ctx = static_cast( + ut_malloc(n_segments * sizeof(*array->aio_ctx))); + + for (ulint i = 0; i < n_segments; ++i) { + if (!os_aio_linux_create_io_ctx(n/n_segments, + &array->aio_ctx[i])) { + /* If something bad happened during aio setup + we disable linux native aio. + The disadvantage will be a small memory leak + at shutdown but that's ok compared to a crash + or a not working server. + This frequently happens when running the test suite + with many threads on a system with low fs.aio-max-nr! + */ + + fprintf(stderr, + " InnoDB: Warning: Linux Native AIO disabled " + "because os_aio_linux_create_io_ctx() " + "failed. 
To get rid of this warning you can " + "try increasing system " + "fs.aio-max-nr to 1048576 or larger or " + "setting innodb_use_native_aio = 0 in my.cnf\n"); + srv_use_native_aio = FALSE; + goto skip_native_aio; + } + } + + /* Initialize the event array. One event per slot. */ + io_event = static_cast( + ut_malloc(n * sizeof(*io_event))); + + memset(io_event, 0x0, sizeof(*io_event) * n); + array->aio_events = io_event; + +skip_native_aio: +#endif /* LINUX_NATIVE_AIO */ + for (ulint i = 0; i < n; i++) { + os_aio_slot_t* slot; + + slot = os_aio_array_get_nth_slot(array, i); + slot->pos = i; + slot->reserved = FALSE; +#ifdef LINUX_NATIVE_AIO + memset(&slot->control, 0x0, sizeof(slot->control)); + slot->n_bytes = 0; + slot->ret = 0; +#endif /* WIN_ASYNC_IO */ + } + + return(array); +} + +/************************************************************************//** +Frees an aio wait array. */ +static +void +os_aio_array_free( +/*==============*/ + os_aio_array_t*& array) /*!< in, own: array to free */ +{ + ulint i; + + os_mutex_free(array->mutex); + os_event_free(array->not_full); + os_event_free(array->is_empty); + +#if defined(LINUX_NATIVE_AIO) + if (srv_use_native_aio) { + ut_free(array->aio_events); + ut_free(array->aio_ctx); + } +#endif /* LINUX_NATIVE_AIO */ + + for (i = 0; i < array->n_slots; i++) { + os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i); + if (slot->page_compression_page) { + ut_free(slot->page_compression_page); + slot->page_compression_page = NULL; + } + + if (slot->lzo_mem) { + ut_free(slot->lzo_mem); + slot->lzo_mem = NULL; + } + } + + for (i = 0; i < array->n_slots; i++) { + os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i); + if (slot->page_encryption_page) { + ut_free(slot->page_encryption_page); + slot->page_encryption_page = NULL; + } + } + + + ut_free(array->slots); + ut_free(array); + + array = 0; +} + +/*********************************************************************** +Initializes the asynchronous io system. 
Creates one array each for ibuf +and log i/o. Also creates one array each for read and write where each +array is divided logically into n_read_segs and n_write_segs +respectively. The caller must create an i/o handler thread for each +segment in these arrays. This function also creates the sync array. +No i/o handler thread needs to be created for that */ +UNIV_INTERN +ibool +os_aio_init( +/*========*/ + ulint n_per_seg, /*= 4); + } else { + ut_ad(n_segments > 0); + } + + os_aio_sync_array = os_aio_array_create(n_slots_sync, 1); + + if (os_aio_sync_array == NULL) { + return(FALSE); + } + + os_aio_n_segments = n_segments; + + os_aio_validate(); + + os_aio_segment_wait_events = static_cast( + ut_malloc(n_segments * sizeof *os_aio_segment_wait_events)); + + for (ulint i = 0; i < n_segments; ++i) { + os_aio_segment_wait_events[i] = os_event_create(); + } + + os_last_printout = ut_time(); + +#ifdef _WIN32 + ut_a(completion_port == 0 && read_completion_port == 0); + completion_port = CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 0); + read_completion_port = CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 0); + ut_a(completion_port && read_completion_port); +#endif + + return(TRUE); + +} + +/*********************************************************************** +Frees the asynchronous io system. 
*/ +UNIV_INTERN +void +os_aio_free(void) +/*=============*/ +{ + if (os_aio_ibuf_array != 0) { + os_aio_array_free(os_aio_ibuf_array); + } + + if (os_aio_log_array != 0) { + os_aio_array_free(os_aio_log_array); + } + + if (os_aio_write_array != 0) { + os_aio_array_free(os_aio_write_array); + } + + if (os_aio_sync_array != 0) { + os_aio_array_free(os_aio_sync_array); + } + + os_aio_array_free(os_aio_read_array); + + for (ulint i = 0; i < os_aio_n_segments; i++) { + os_event_free(os_aio_segment_wait_events[i]); + } + + ut_free(os_aio_segment_wait_events); + os_aio_segment_wait_events = 0; + os_aio_n_segments = 0; +#ifdef _WIN32 + completion_port = 0; + read_completion_port = 0; +#endif +} + +#ifdef WIN_ASYNC_IO +/************************************************************************//** +Wakes up all async i/o threads in the array in Windows async i/o at +shutdown. */ +static +void +os_aio_array_wake_win_aio_at_shutdown( +/*==================================*/ + os_aio_array_t* array) /*!< in: aio array */ +{ + if(completion_port) + { + PostQueuedCompletionStatus(completion_port, 0, IOCP_SHUTDOWN_KEY, NULL); + PostQueuedCompletionStatus(read_completion_port, 0, IOCP_SHUTDOWN_KEY, NULL); + } +} +#endif + +/************************************************************************//** +Wakes up all async i/o threads so that they know to exit themselves in +shutdown. 
*/ +UNIV_INTERN +void +os_aio_wake_all_threads_at_shutdown(void) +/*=====================================*/ +{ +#ifdef WIN_ASYNC_IO + /* This code wakes up all ai/o threads in Windows native aio */ + os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array); + if (os_aio_write_array != 0) { + os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array); + } + + if (os_aio_ibuf_array != 0) { + os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array); + } + + if (os_aio_log_array != 0) { + os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array); + } + +#elif defined(LINUX_NATIVE_AIO) + + /* When using native AIO interface the io helper threads + wait on io_getevents with a timeout value of 500ms. At + each wake up these threads check the server status. + No need to do anything to wake them up. */ + + if (srv_use_native_aio) { + return; + } + + /* Fall through to simulated AIO handler wakeup if we are + not using native AIO. */ +#endif /* !WIN_ASYNC_AIO */ + + /* This loop wakes up all simulated ai/o threads */ + + for (ulint i = 0; i < os_aio_n_segments; i++) { + + os_event_set(os_aio_segment_wait_events[i]); + } +} + +/************************************************************************//** +Waits until there are no pending writes in os_aio_write_array. There can +be other, synchronous, pending writes. */ +UNIV_INTERN +void +os_aio_wait_until_no_pending_writes(void) +/*=====================================*/ +{ + ut_ad(!srv_read_only_mode); + os_event_wait(os_aio_write_array->is_empty); +} + +/**********************************************************************//** +Calculates segment number for a slot. 
+@return segment number (which is the number used by, for example, +i/o-handler threads) */ +static +ulint +os_aio_get_segment_no_from_slot( +/*============================*/ + os_aio_array_t* array, /*!< in: aio wait array */ + os_aio_slot_t* slot) /*!< in: slot in this array */ +{ + ulint segment; + ulint seg_len; + + if (array == os_aio_ibuf_array) { + ut_ad(!srv_read_only_mode); + + segment = IO_IBUF_SEGMENT; + + } else if (array == os_aio_log_array) { + ut_ad(!srv_read_only_mode); + + segment = IO_LOG_SEGMENT; + + } else if (array == os_aio_read_array) { + seg_len = os_aio_read_array->n_slots + / os_aio_read_array->n_segments; + + segment = (srv_read_only_mode ? 0 : 2) + slot->pos / seg_len; + } else { + ut_ad(!srv_read_only_mode); + ut_a(array == os_aio_write_array); + + seg_len = os_aio_write_array->n_slots + / os_aio_write_array->n_segments; + + segment = os_aio_read_array->n_segments + 2 + + slot->pos / seg_len; + } + + return(segment); +} + +/**********************************************************************//** +Calculates local segment number and aio array from global segment number. 
+@return local segment number within the aio array */ +static +ulint +os_aio_get_array_and_local_segment( +/*===============================*/ + os_aio_array_t** array, /*!< out: aio wait array */ + ulint global_segment)/*!< in: global segment number */ +{ + ulint segment; + + ut_a(global_segment < os_aio_n_segments); + + if (srv_read_only_mode) { + *array = os_aio_read_array; + + return(global_segment); + } else if (global_segment == IO_IBUF_SEGMENT) { + *array = os_aio_ibuf_array; + segment = 0; + + } else if (global_segment == IO_LOG_SEGMENT) { + *array = os_aio_log_array; + segment = 0; + + } else if (global_segment < os_aio_read_array->n_segments + 2) { + *array = os_aio_read_array; + + segment = global_segment - 2; + } else { + *array = os_aio_write_array; + + segment = global_segment - (os_aio_read_array->n_segments + 2); + } + + return(segment); +} + +/*******************************************************************//** +Requests for a slot in the aio array. If no slot is available, waits until +not_full-event becomes signaled. 
+@return pointer to slot */ +static +os_aio_slot_t* +os_aio_array_reserve_slot( +/*======================*/ + ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */ + os_aio_array_t* array, /*!< in: aio array */ + fil_node_t* message1,/*!< in: message to be passed along with + the aio operation */ + void* message2,/*!< in: message to be passed along with + the aio operation */ + os_file_t file, /*!< in: file handle */ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + void* buf, /*!< in: buffer where to read or from which + to write */ + os_offset_t offset, /*!< in: file offset */ + ulint len, /*!< in: length of the block to read or write */ + ulint space_id, + ibool page_compression, /*!< in: is page compression used + on this file space */ + ulint page_compression_level, /*!< page compression + level to be used */ + ibool page_encryption, /*!< in: is page encryption used + on this file space */ + ulint page_encryption_key, /*!< page encryption key + to be used */ + ulint* write_size)/*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ +{ + os_aio_slot_t* slot = NULL; +#ifdef WIN_ASYNC_IO + OVERLAPPED* control; + +#elif defined(LINUX_NATIVE_AIO) + + struct iocb* iocb; + off_t aio_offset; + +#endif /* WIN_ASYNC_IO */ + ulint i; + ulint counter; + ulint slots_per_seg; + ulint local_seg; + +#ifdef WIN_ASYNC_IO + ut_a((len & 0xFFFFFFFFUL) == len); +#endif /* WIN_ASYNC_IO */ + + /* No need of a mutex. Only reading constant fields */ + slots_per_seg = array->n_slots / array->n_segments; + + /* We attempt to keep adjacent blocks in the same local + segment. 
This can help in merging IO requests when we are + doing simulated AIO */ + local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6)) + % array->n_segments; + +loop: + os_mutex_enter(array->mutex); + + if (array->n_reserved == array->n_slots) { + os_mutex_exit(array->mutex); + + if (!srv_use_native_aio) { + /* If the handler threads are suspended, wake them + so that we get more slots */ + + os_aio_simulated_wake_handler_threads(); + } + + os_event_wait(array->not_full); + + goto loop; + } + + /* We start our search for an available slot from our preferred + local segment and do a full scan of the array. We are + guaranteed to find a slot in full scan. */ + for (i = local_seg * slots_per_seg, counter = 0; + counter < array->n_slots; + i++, counter++) { + + i %= array->n_slots; + + slot = os_aio_array_get_nth_slot(array, i); + + if (slot->reserved == FALSE) { + goto found; + } + } + + /* We MUST always be able to get hold of a reserved slot. */ + ut_error; + +found: + ut_a(slot->reserved == FALSE); + array->n_reserved++; + + if (array->n_reserved == 1) { + os_event_reset(array->is_empty); + } + + if (array->n_reserved == array->n_slots) { + os_event_reset(array->not_full); + } + + slot->reserved = TRUE; + slot->reservation_time = ut_time(); + slot->message1 = message1; + slot->message2 = message2; + slot->file = file; + slot->name = name; + slot->len = len; + slot->type = type; + slot->buf = static_cast(buf); + slot->offset = offset; + slot->io_already_done = FALSE; + slot->space_id = space_id; + + slot->page_compress_success = FALSE; + slot->page_encryption_success = FALSE; + + slot->write_size = write_size; + slot->page_compression_level = page_compression_level; + slot->page_compression = page_compression; + slot->page_encryption_key = page_encryption_key; + slot->page_encryption = page_encryption; + + /* If the space is page compressed and this is write operation + then we compress the page */ + if (message1 && type == OS_FILE_WRITE && page_compression ) { + ulint 
real_len = len; + byte* tmp = NULL; + + /* Release the array mutex while compressing */ + os_mutex_exit(array->mutex); + + // We allocate memory for page compressed buffer if and only + // if it is not yet allocated. + if (slot->page_buf == NULL) { + os_slot_alloc_page_buf(slot); + } + +#ifdef HAVE_LZO + if (innodb_compression_algorithm == 3 && slot->lzo_mem == NULL) { + os_slot_alloc_lzo_mem(slot); + } +#endif + + /* Call page compression */ + tmp = fil_compress_page(fil_node_get_space_id(slot->message1), + (byte *)buf, + slot->page_buf, + len, + page_compression_level, + &real_len, + slot->lzo_mem + ); + + /* If compression succeeded, set up the length and buffer */ + if (tmp != buf) { + len = real_len; + buf = slot->page_buf; + slot->len = real_len; + slot->page_compress_success = TRUE; + } else { + slot->page_compress_success = FALSE; + } + + /* Take array mutex back */ + os_mutex_enter(array->mutex); + + } //CMD + /* If the space is page encryption and this is write operation + then we encrypt the page */ + if (message1 && type == OS_FILE_WRITE && page_encryption ) { + ulint real_len = len; + byte* tmp = NULL; + + /* Release the array mutex while encrypting */ + os_mutex_exit(array->mutex); + + // We allocate memory for page encrypted buffer if and only + // if it is not yet allocated. 
+ if (slot->page_buf2 == NULL) { + os_slot_alloc_page_buf2(slot); + } + + ut_ad(slot->page_buf2); + tmp = fil_encrypt_page(fil_node_get_space_id(slot->message1), (byte *)buf, slot->page_buf2, len, page_encryption_key, &real_len); + + /* If encryption succeeded, set up the length and buffer */ + if (tmp != buf) { + len = real_len; + buf = slot->page_buf2; + slot->len = real_len; + slot->page_encryption_success = TRUE; + } else { + slot->page_encryption_success = FALSE; + } + + /* Take array mutex back */ + os_mutex_enter(array->mutex); + } + +#ifdef WIN_ASYNC_IO + control = &slot->control; + control->Offset = (DWORD) offset & 0xFFFFFFFF; + control->OffsetHigh = (DWORD) (offset >> 32); + control->hEvent = 0; + slot->arr = array; + +#elif defined(LINUX_NATIVE_AIO) + + /* If we are not using native AIO skip this part. */ + if (!srv_use_native_aio) { + goto skip_native_aio; + } + + /* Check if we are dealing with 64 bit arch. + If not then make sure that offset fits in 32 bits. */ + aio_offset = (off_t) offset; + + ut_a(sizeof(aio_offset) >= sizeof(offset) + || ((os_offset_t) aio_offset) == offset); + + iocb = &slot->control; + + if (type == OS_FILE_READ) { + io_prep_pread(iocb, file, buf, len, aio_offset); + } else { + ut_a(type == OS_FILE_WRITE); + io_prep_pwrite(iocb, file, buf, len, aio_offset); + } + + iocb->data = (void*) slot; + slot->n_bytes = 0; + slot->ret = 0; + +skip_native_aio: +#endif /* LINUX_NATIVE_AIO */ + os_mutex_exit(array->mutex); + + return(slot); +} + +/*******************************************************************//** +Frees a slot in the aio array. 
*/ +static +void +os_aio_array_free_slot( +/*===================*/ + os_aio_array_t* array, /*!< in: aio array */ + os_aio_slot_t* slot) /*!< in: pointer to slot */ +{ + os_mutex_enter(array->mutex); + + ut_ad(slot->reserved); + + slot->reserved = FALSE; + + array->n_reserved--; + + if (array->n_reserved == array->n_slots - 1) { + os_event_set(array->not_full); + } + + if (array->n_reserved == 0) { + os_event_set(array->is_empty); + } + +#ifdef LINUX_NATIVE_AIO + + if (srv_use_native_aio) { + memset(&slot->control, 0x0, sizeof(slot->control)); + slot->n_bytes = 0; + slot->ret = 0; + /*fprintf(stderr, "Freed up Linux native slot.\n");*/ + } else { + /* These fields should not be used if we are not + using native AIO. */ + ut_ad(slot->n_bytes == 0); + ut_ad(slot->ret == 0); + } + +#endif + os_mutex_exit(array->mutex); +} + +/**********************************************************************//** +Wakes up a simulated aio i/o-handler thread if it has something to do. */ +static +void +os_aio_simulated_wake_handler_thread( +/*=================================*/ + ulint global_segment) /*!< in: the number of the segment in the aio + arrays */ +{ + os_aio_array_t* array; + ulint segment; + + ut_ad(!srv_use_native_aio); + + segment = os_aio_get_array_and_local_segment(&array, global_segment); + + ulint n = array->n_slots / array->n_segments; + + segment *= n; + + /* Look through n slots after the segment * n'th slot */ + + os_mutex_enter(array->mutex); + + for (ulint i = 0; i < n; ++i) { + const os_aio_slot_t* slot; + + slot = os_aio_array_get_nth_slot(array, segment + i); + + if (slot->reserved) { + + /* Found an i/o request */ + + os_mutex_exit(array->mutex); + + os_event_t event; + + event = os_aio_segment_wait_events[global_segment]; + + os_event_set(event); + + return; + } + } + + os_mutex_exit(array->mutex); +} + +/**********************************************************************//** +Wakes up simulated aio i/o-handler threads if they have something to do. 
*/ +UNIV_INTERN +void +os_aio_simulated_wake_handler_threads(void) +/*=======================================*/ +{ + if (srv_use_native_aio) { + /* We do not use simulated aio: do nothing */ + + return; + } + + os_aio_recommend_sleep_for_read_threads = FALSE; + + for (ulint i = 0; i < os_aio_n_segments; i++) { + os_aio_simulated_wake_handler_thread(i); + } +} + +/**********************************************************************//** +This function can be called if one wants to post a batch of reads and +prefers an i/o-handler thread to handle them all at once later. You must +call os_aio_simulated_wake_handler_threads later to ensure the threads +are not left sleeping! */ +UNIV_INTERN +void +os_aio_simulated_put_read_threads_to_sleep(void) +/*============================================*/ +{ + +/* The idea of putting background IO threads to sleep is only for +Windows when using simulated AIO. Windows XP seems to schedule +background threads too eagerly to allow for coalescing during +readahead requests. */ +#ifdef __WIN__ + os_aio_array_t* array; + + if (srv_use_native_aio) { + /* We do not use simulated aio: do nothing */ + + return; + } + + os_aio_recommend_sleep_for_read_threads = TRUE; + + for (ulint i = 0; i < os_aio_n_segments; i++) { + os_aio_get_array_and_local_segment(&array, i); + + if (array == os_aio_read_array) { + + os_event_reset(os_aio_segment_wait_events[i]); + } + } +#endif /* __WIN__ */ +} + +#if defined(LINUX_NATIVE_AIO) +/*******************************************************************//** +Dispatch an AIO request to the kernel. +@return TRUE on success. */ +static +ibool +os_aio_linux_dispatch( +/*==================*/ + os_aio_array_t* array, /*!< in: io request array. */ + os_aio_slot_t* slot) /*!< in: an already reserved slot. */ +{ + int ret; + ulint io_ctx_index; + struct iocb* iocb; + + ut_ad(slot != NULL); + ut_ad(array); + + ut_a(slot->reserved); + + /* Find out what we are going to work with. 
+ The iocb struct is directly in the slot. + The io_context is one per segment. */ + + iocb = &slot->control; + io_ctx_index = (slot->pos * array->n_segments) / array->n_slots; + + ret = io_submit(array->aio_ctx[io_ctx_index], 1, &iocb); + +#if defined(UNIV_AIO_DEBUG) + fprintf(stderr, + "io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n", + (slot->type == OS_FILE_WRITE) ? 'w' : 'r', ret, slot, + array->aio_ctx[io_ctx_index], (ulong) io_ctx_index); +#endif + + /* io_submit returns number of successfully + queued requests or -errno. */ + if (UNIV_UNLIKELY(ret != 1)) { + errno = -ret; + return(FALSE); + } + + return(TRUE); +} +#endif /* LINUX_NATIVE_AIO */ + + +/*******************************************************************//** +NOTE! Use the corresponding macro os_aio(), not directly this function! +Requests an asynchronous i/o operation. +@return TRUE if request was queued successfully, FALSE if fail */ +UNIV_INTERN +ibool +os_aio_func( +/*========*/ + ulint type, /*!< in: OS_FILE_READ or OS_FILE_WRITE */ + ulint mode, /*!< in: OS_AIO_NORMAL, ..., possibly ORed + to OS_AIO_SIMULATED_WAKE_LATER: the + last flag advises this function not to wake + i/o-handler threads, but the caller will + do the waking explicitly later, in this + way the caller can post several requests in + a batch; NOTE that the batch must not be + so big that it exhausts the slots in aio + arrays! NOTE that a simulated batch + may introduce hidden chances of deadlocks, + because i/os are not actually handled until + all have been posted: use with great + caution! 
*/ + const char* name, /*!< in: name of the file or path as a + null-terminated string */ + os_file_t file, /*!< in: handle to a file */ + void* buf, /*!< in: buffer where to read or from which + to write */ + os_offset_t offset, /*!< in: file offset where to read or write */ + ulint n, /*!< in: number of bytes to read or write */ + fil_node_t* message1,/*!< in: message for the aio handler + (can be used to identify a completed + aio operation); ignored if mode is + OS_AIO_SYNC */ + void* message2,/*!< in: message for the aio handler + (can be used to identify a completed + aio operation); ignored if mode is + OS_AIO_SYNC */ + ulint space_id, + trx_t* trx, + ibool page_compression, /*!< in: is page compression used + on this file space */ + ulint page_compression_level, /*!< page compression + level to be used */ + ibool page_encryption, /*!< in: is page encryption used + on this file space */ + ulint page_encryption_key, /*!< page encryption key + to be used */ + ulint* write_size)/*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. 
*/ +{ + os_aio_array_t* array; + os_aio_slot_t* slot; +#ifdef WIN_ASYNC_IO + DWORD len = (DWORD) n; + BOOL ret; +#endif + ulint wake_later; + + ut_ad(file); + ut_ad(buf); + ut_ad(n > 0); + ut_ad(n % OS_MIN_LOG_BLOCK_SIZE == 0); + ut_ad(offset % OS_MIN_LOG_BLOCK_SIZE == 0); + ut_ad(os_aio_validate_skip()); +#ifdef WIN_ASYNC_IO + ut_ad((n & 0xFFFFFFFFUL) == n); +#endif + + wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER; + mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER); + + if (mode == OS_AIO_SYNC) + { + ibool ret; + /* This is actually an ordinary synchronous read or write: + no need to use an i/o-handler thread */ + + if (type == OS_FILE_READ) { + ret = os_file_read_func(file, buf, offset, n, trx, + page_compression); + } + else { + ut_ad(!srv_read_only_mode); + ut_a(type == OS_FILE_WRITE); + + ret = os_file_write(name, file, buf, offset, n); + } + ut_a(ret); + return ret; + } + +try_again: + switch (mode) { + case OS_AIO_NORMAL: + if (type == OS_FILE_READ) { + array = os_aio_read_array; + } else { + ut_ad(!srv_read_only_mode); + array = os_aio_write_array; + } + break; + case OS_AIO_IBUF: + ut_ad(type == OS_FILE_READ); + /* Reduce probability of deadlock bugs in connection with ibuf: + do not let the ibuf i/o handler sleep */ + + wake_later = FALSE; + + if (srv_read_only_mode) { + array = os_aio_read_array; + } else { + array = os_aio_ibuf_array; + } + break; + case OS_AIO_LOG: + if (srv_read_only_mode) { + array = os_aio_read_array; + } else { + array = os_aio_log_array; + } + break; + case OS_AIO_SYNC: + array = os_aio_sync_array; +#if defined(LINUX_NATIVE_AIO) + /* In Linux native AIO we don't use sync IO array. 
*/ + ut_a(!srv_use_native_aio); +#endif /* LINUX_NATIVE_AIO */ + break; + default: + ut_error; + array = NULL; /* Eliminate compiler warning */ + } + + if (trx && type == OS_FILE_READ) + { + trx->io_reads++; + trx->io_read += n; + } + slot = os_aio_array_reserve_slot(type, array, message1, message2, file, + name, buf, offset, n, space_id, + page_compression, page_compression_level, + page_encryption, page_encryption_key, write_size); + if (type == OS_FILE_READ) { + if (srv_use_native_aio) { + os_n_file_reads++; + os_bytes_read_since_printout += n; +#ifdef WIN_ASYNC_IO + ret = ReadFile(file, buf, (DWORD) n, &len, + &(slot->control)); + if(!ret && GetLastError() != ERROR_IO_PENDING) + goto err_exit; + +#elif defined(LINUX_NATIVE_AIO) + if (!os_aio_linux_dispatch(array, slot)) { + goto err_exit; + } +#endif /* WIN_ASYNC_IO */ + } else { + if (!wake_later) { + os_aio_simulated_wake_handler_thread( + os_aio_get_segment_no_from_slot( + array, slot)); + } + } + } else if (type == OS_FILE_WRITE) { + ut_ad(!srv_read_only_mode); + if (srv_use_native_aio) { + os_n_file_writes++; +#ifdef WIN_ASYNC_IO + ret = WriteFile(file, buf, (DWORD) n, &len, + &(slot->control)); + + if(!ret && GetLastError() != ERROR_IO_PENDING) + goto err_exit; +#elif defined(LINUX_NATIVE_AIO) + if (!os_aio_linux_dispatch(array, slot)) { + goto err_exit; + } +#endif /* WIN_ASYNC_IO */ + } else { + if (!wake_later) { + os_aio_simulated_wake_handler_thread( + os_aio_get_segment_no_from_slot( + array, slot)); + } + } + } else { + ut_error; + } + + /* aio was queued successfully! */ + return(TRUE); + +#if defined LINUX_NATIVE_AIO || defined WIN_ASYNC_IO +err_exit: +#endif /* LINUX_NATIVE_AIO || WIN_ASYNC_IO */ + os_aio_array_free_slot(array, slot); + + if (os_file_handle_error( + name,type == OS_FILE_READ ? 
"aio read" : "aio write", __FILE__, __LINE__)) { + + goto try_again; + } + + return(FALSE); +} + +#ifdef WIN_ASYNC_IO +#define READ_SEGMENT(x) (x < srv_n_read_io_threads) +#define WRITE_SEGMENT(x) !READ_SEGMENT(x) + +/**********************************************************************//** +This function is only used in Windows asynchronous i/o. +Waits for an aio operation to complete. This function is used to wait the +for completed requests. The aio array of pending requests is divided +into segments. The thread specifies which segment or slot it wants to wait +for. NOTE: this function will also take care of freeing the aio slot, +therefore no other thread is allowed to do the freeing! +@return TRUE if the aio operation succeeded */ +UNIV_INTERN +ibool +os_aio_windows_handle( +/*==================*/ + ulint segment, /*!< in: the number of the segment in the aio + arrays to wait for; segment 0 is the ibuf + i/o thread, segment 1 the log i/o thread, + then follow the non-ibuf read threads, and as + the last are the non-ibuf write threads; if + this is ULINT_UNDEFINED, then it means that + sync aio is used, and this parameter is + ignored */ + ulint pos, /*!< this parameter is used only in sync aio: + wait for the aio slot at this position */ + fil_node_t**message1, /*!< out: the messages passed with the aio + request; note that also in the case where + the aio operation failed, these output + parameters are valid and can be used to + restart the operation, for example */ + void** message2, + ulint* type, /*!< out: OS_FILE_WRITE or ..._READ */ + ulint* space_id) +{ + ulint orig_seg = segment; + os_aio_slot_t* slot; + ibool ret_val; + BOOL ret; + DWORD len; + BOOL retry = FALSE; + ULONG_PTR key; + HANDLE port = READ_SEGMENT(segment)? 
read_completion_port : completion_port; + + for(;;) { + ret = GetQueuedCompletionStatus(port, &len, &key, + (OVERLAPPED **)&slot, INFINITE); + + /* If shutdown key was received, repost the shutdown message and exit */ + if (ret && (key == IOCP_SHUTDOWN_KEY)) { + PostQueuedCompletionStatus(port, 0, key, NULL); + os_thread_exit(NULL); + } + + if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) { + os_thread_exit(NULL); + } + + if(WRITE_SEGMENT(segment)&& slot->type == OS_FILE_READ) { + /* + Redirect read completions to the dedicated completion port + and thread. We need to split read and write threads. If we do not + do that, and just allow all io threads process all IO, it is possible + to get stuck in a deadlock in buffer pool code, + + Currently, the problem is solved this way - "write io" threads + always get all completion notifications, from both async reads and + writes. Write completion is handled in the same thread that gets it. + Read completion is forwarded via PostQueueCompletionStatus()) + to the second completion port dedicated solely to reads. One of the + "read io" threads waiting on this port will finally handle the IO. + + Forwarding IO completion this way costs a context switch , and this + seems tolerable since asynchronous reads are by far less frequent. + */ + ut_a(PostQueuedCompletionStatus(read_completion_port, len, key, + &slot->control)); + } + else { + break; + } + } + + *message1 = slot->message1; + *message2 = slot->message2; + + *type = slot->type; + *space_id = slot->space_id; + + if (ret && len == slot->len) { + + ret_val = TRUE; + } else if (os_file_handle_error(slot->name, "Windows aio", __FILE__, __LINE__)) { + + retry = TRUE; + } else { + + ret_val = FALSE; + } + + if (retry) { + /* retry failed read/write operation synchronously. + No need to hold array->mutex. */ + +#ifdef UNIV_PFS_IO + /* This read/write does not go through os_file_read + and os_file_write APIs, need to register with + performance schema explicitly here. 
*/ + struct PSI_file_locker* locker = NULL; + register_pfs_file_io_begin(locker, slot->file, slot->len, + (slot->type == OS_FILE_WRITE) + ? PSI_FILE_WRITE + : PSI_FILE_READ, + __FILE__, __LINE__); +#endif + + ut_a((slot->len & 0xFFFFFFFFUL) == slot->len); + + switch (slot->type) { + case OS_FILE_WRITE: + if (slot->message1 && slot->page_compression && slot->page_buf) { + ret_val = os_file_write(slot->name, slot->file, slot->page_buf, + slot->offset, slot->len); + } else { + + ret_val = os_file_write(slot->name, slot->file, slot->buf, + slot->offset, slot->len); + } + break; + case OS_FILE_READ: + ret_val = os_file_read(slot->file, slot->buf, + slot->offset, slot->len, slot->page_compression); + break; + default: + ut_error; + } + +#ifdef UNIV_PFS_IO + register_pfs_file_io_end(locker, len); +#endif + + if (!ret && GetLastError() == ERROR_IO_PENDING) { + /* aio was queued successfully! + We want a synchronous i/o operation on a + file where we also use async i/o: in Windows + we must use the same wait mechanism as for + async i/o */ + + ret = GetOverlappedResult(slot->file, + &(slot->control), + &len, TRUE); + } + + ret_val = ret && len == slot->len; + } + + if (slot->message1 && slot->page_compression) { + // We allocate memory for page compressed buffer if and only + // if it is not yet allocated. 
+ if (slot->page_buf == NULL) { + os_slot_alloc_page_buf(slot); + } + +#ifdef HAVE_LZO + if (innodb_compression_algorithm == 3 && slot->lzo_mem == NULL) { + os_slot_alloc_lzo_mem(slot); + } +#endif + if (slot->type == OS_FILE_READ) { + fil_decompress_page(slot->page_buf, slot->buf, slot->len, slot->write_size); + } else { + if (slot->page_compress_success && fil_page_is_compressed(slot->page_buf)) { + if (srv_use_trim && os_fallocate_failed == FALSE) { + // Deallocate unused blocks from file system + os_file_trim(slot->file, slot, slot->len); + } + } + } + } + + os_aio_array_free_slot((os_aio_array_t *)slot->arr, slot); + + return(ret_val); +} +#endif + +#if defined(LINUX_NATIVE_AIO) +/******************************************************************//** +This function is only used in Linux native asynchronous i/o. This is +called from within the io-thread. If there are no completed IO requests +in the slot array, the thread calls this function to collect more +requests from the kernel. +The io-thread waits on io_getevents(), which is a blocking call, with +a timeout value. Unless the system is very heavy loaded, keeping the +io-thread very busy, the io-thread will spend most of its time waiting +in this function. +The io-thread also exits in this function. It checks server status at +each wakeup and that is why we use timed wait in io_getevents(). */ +static +void +os_aio_linux_collect( +/*=================*/ + os_aio_array_t* array, /*!< in/out: slot array. */ + ulint segment, /*!< in: local segment no. */ + ulint seg_size) /*!< in: segment size. */ +{ + int i; + int ret; + ulint start_pos; + ulint end_pos; + struct timespec timeout; + struct io_event* events; + struct io_context* io_ctx; + + /* sanity checks. */ + ut_ad(array != NULL); + ut_ad(seg_size > 0); + ut_ad(segment < array->n_segments); + + /* Which part of event array we are going to work on. */ + events = &array->aio_events[segment * seg_size]; + + /* Which io_context we are going to use. 
*/ + io_ctx = array->aio_ctx[segment]; + + /* Starting point of the segment we will be working on. */ + start_pos = segment * seg_size; + + /* End point. */ + end_pos = start_pos + seg_size; + +retry: + + /* Initialize the events. The timeout value is arbitrary. + We probably need to experiment with it a little. */ + memset(events, 0, sizeof(*events) * seg_size); + timeout.tv_sec = 0; + timeout.tv_nsec = OS_AIO_REAP_TIMEOUT; + + ret = io_getevents(io_ctx, 1, seg_size, events, &timeout); + + if (ret > 0) { + for (i = 0; i < ret; i++) { + os_aio_slot_t* slot; + struct iocb* control; + + control = (struct iocb*) events[i].obj; + ut_a(control != NULL); + + slot = (os_aio_slot_t*) control->data; + + /* Some sanity checks. */ + ut_a(slot != NULL); + ut_a(slot->reserved); + +#if defined(UNIV_AIO_DEBUG) + fprintf(stderr, + "io_getevents[%c]: slot[%p] ctx[%p]" + " seg[%lu]\n", + (slot->type == OS_FILE_WRITE) ? 'w' : 'r', + slot, io_ctx, segment); +#endif + + /* We are not scribbling previous segment. */ + ut_a(slot->pos >= start_pos); + + /* We have not overstepped to next segment. */ + ut_a(slot->pos < end_pos); + + /* If the table is page compressed and this is read, + we decompress before we annouce the read is + complete. For writes, we free the compressed page. */ + if (slot->message1 && slot->page_compression) { + // We allocate memory for page compressed buffer if and only + // if it is not yet allocated. 
+ if (slot->page_buf == NULL) { + os_slot_alloc_page_buf(slot); + } + +#ifdef HAVE_LZO + if (innodb_compression_algorithm == 3 && slot->lzo_mem == NULL) { + os_slot_alloc_lzo_mem(slot); + } +#endif + if (slot->type == OS_FILE_READ) { + fil_decompress_page(slot->page_buf, slot->buf, slot->len, slot->write_size); + } else { + if (slot->page_compress_success && + fil_page_is_compressed(slot->page_buf)) { + ut_ad(slot->page_compression_page); + if (srv_use_trim && os_fallocate_failed == FALSE) { + // Deallocate unused blocks from file system + os_file_trim(slot->file, slot, slot->len); + } + } + } + } + + /* page encryption */ + if (slot->message1 && slot->page_encryption) { + if (slot->page_buf2==NULL) { + os_slot_alloc_page_buf2(slot); + } + + ut_ad(slot->page_buf2); + + if (slot->type == OS_FILE_READ) { + if (fil_page_is_encrypted(slot->buf)) { + fil_decrypt_page(slot->page_buf2, slot->buf, slot->len, slot->write_size); + } + } else { + if (slot->page_encryption_success && + fil_page_is_encrypted(slot->page_buf2)) { + ut_ad(slot->page_encryption_page); + if (srv_use_trim && os_fallocate_failed == FALSE) { + // Deallocate unused blocks from file system + os_file_trim(slot->file, slot, slot->len); + } + } + } + } + + /* Mark this request as completed. The error handling + will be done in the calling function. */ + os_mutex_enter(array->mutex); + slot->n_bytes = events[i].res; + slot->ret = events[i].res2; + slot->io_already_done = TRUE; + os_mutex_exit(array->mutex); + } + return; + } + + if (UNIV_UNLIKELY(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) { + return; + } + + /* This error handling is for any error in collecting the + IO requests. The errors, if any, for any particular IO + request are simply passed on to the calling routine. */ + + switch (ret) { + case -EAGAIN: + /* Not enough resources! Try again. */ + case -EINTR: + /* Interrupted! I have tested the behaviour in case of an + interrupt. 
If we have some completed IOs available then + the return code will be the number of IOs. We get EINTR only + if there are no completed IOs and we have been interrupted. */ + case 0: + /* No pending request! Go back and check again. */ + goto retry; + } + + /* All other errors should cause a trap for now. */ + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: unexpected ret_code[%d] from io_getevents()!\n", + ret); + ut_error; +} + +/**********************************************************************//** +This function is only used in Linux native asynchronous i/o. +Waits for an aio operation to complete. This function is used to wait for +the completed requests. The aio array of pending requests is divided +into segments. The thread specifies which segment or slot it wants to wait +for. NOTE: this function will also take care of freeing the aio slot, +therefore no other thread is allowed to do the freeing! +@return TRUE if the IO was successful */ +UNIV_INTERN +ibool +os_aio_linux_handle( +/*================*/ + ulint global_seg, /*!< in: segment number in the aio array + to wait for; segment 0 is the ibuf + i/o thread, segment 1 is log i/o thread, + then follow the non-ibuf read threads, + and the last are the non-ibuf write + threads. */ + fil_node_t**message1, /*!< out: the messages passed with the */ + void** message2, /*!< aio request; note that in case the + aio operation failed, these output + parameters are valid and can be used to + restart the operation. */ + ulint* type, /*!< out: OS_FILE_WRITE or ..._READ */ + ulint* space_id) +{ + ulint segment; + os_aio_array_t* array; + os_aio_slot_t* slot; + ulint n; + ulint i; + ibool ret = FALSE; + + /* Should never be doing Sync IO here. */ + ut_a(global_seg != ULINT_UNDEFINED); + + /* Find the array and the local segment. 
*/ + segment = os_aio_get_array_and_local_segment(&array, global_seg); + n = array->n_slots / array->n_segments; + + wait_for_event: + /* Loop until we have found a completed request. */ + for (;;) { + ibool any_reserved = FALSE; + os_mutex_enter(array->mutex); + for (i = 0; i < n; ++i) { + slot = os_aio_array_get_nth_slot( + array, i + segment * n); + if (!slot->reserved) { + continue; + } else if (slot->io_already_done) { + /* Something for us to work on. */ + goto found; + } else { + any_reserved = TRUE; + } + } + + os_mutex_exit(array->mutex); + + /* There is no completed request. + If there is no pending request at all, + and the system is being shut down, exit. */ + if (UNIV_UNLIKELY + (!any_reserved + && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS)) { + *message1 = NULL; + *message2 = NULL; + return(TRUE); + } + + /* Wait for some request. Note that we return + from wait iff we have found a request. */ + + srv_set_io_thread_op_info(global_seg, + "waiting for completed aio requests"); + os_aio_linux_collect(array, segment, n); + } + +found: + /* Note that it may be that there are more then one completed + IO requests. We process them one at a time. We may have a case + here to improve the performance slightly by dealing with all + requests in one sweep. */ + srv_set_io_thread_op_info(global_seg, + "processing completed aio requests"); + + /* Ensure that we are scribbling only our segment. 
*/ + ut_a(i < n); + + ut_ad(slot != NULL); + ut_ad(slot->reserved); + ut_ad(slot->io_already_done); + + *message1 = slot->message1; + *message2 = slot->message2; + + *type = slot->type; + *space_id = slot->space_id; + + if (slot->ret == 0 && slot->n_bytes == (long) slot->len) { + + ret = TRUE; + } else if ((slot->ret == 0) && (slot->n_bytes > 0) + && (slot->n_bytes < (long) slot->len)) { + /* Partial read or write scenario */ + int submit_ret; + struct iocb* iocb; + slot->buf = (byte*)slot->buf + slot->n_bytes; + slot->offset = slot->offset + slot->n_bytes; + slot->len = slot->len - slot->n_bytes; + /* Resetting the bytes read/written */ + slot->n_bytes = 0; + slot->io_already_done = FALSE; + iocb = &(slot->control); + + if (slot->type == OS_FILE_READ) { + io_prep_pread(&slot->control, slot->file, slot->buf, + slot->len, (off_t) slot->offset); + } else { + ut_a(slot->type == OS_FILE_WRITE); + io_prep_pwrite(&slot->control, slot->file, slot->buf, + slot->len, (off_t) slot->offset); + } + /* Resubmit an I/O request */ + submit_ret = io_submit(array->aio_ctx[segment], 1, &iocb); + if (submit_ret < 0 ) { + /* Aborting in case of submit failure */ + ib_logf(IB_LOG_LEVEL_FATAL, + "Native Linux AIO interface. io_submit()" + " call failed when resubmitting a partial" + " I/O request on the file %s.", + slot->name); + } else { + ret = FALSE; + os_mutex_exit(array->mutex); + goto wait_for_event; + } + } else { + errno = -slot->ret; + + if (slot->ret == 0) { + fprintf(stderr, + "InnoDB: Number of bytes after aio %d requested %lu\n" + "InnoDB: from file %s\n", + slot->n_bytes, slot->len, slot->name); + } + + /* os_file_handle_error does tell us if we should retry + this IO. As it stands now, we don't do this retry when + reaping requests from a different context than + the dispatcher. This non-retry logic is the same for + windows and linux native AIO. + We should probably look into this to transparently + re-submit the IO. 
*/ + os_file_handle_error(slot->name, "Linux aio", __FILE__, __LINE__); + + ret = FALSE; + } + + os_mutex_exit(array->mutex); + + os_aio_array_free_slot(array, slot); + + return(ret); +} +#endif /* LINUX_NATIVE_AIO */ + +/**********************************************************************//** +Does simulated aio. This function should be called by an i/o-handler +thread. +@return TRUE if the aio operation succeeded */ +UNIV_INTERN +ibool +os_aio_simulated_handle( +/*====================*/ + ulint global_segment, /*!< in: the number of the segment in the aio + arrays to wait for; segment 0 is the ibuf + i/o thread, segment 1 the log i/o thread, + then follow the non-ibuf read threads, and as + the last are the non-ibuf write threads */ + fil_node_t**message1, /*!< out: the messages passed with the aio + request; note that also in the case where + the aio operation failed, these output + parameters are valid and can be used to + restart the operation, for example */ + void** message2, + ulint* type, /*!< out: OS_FILE_WRITE or ..._READ */ + ulint* space_id) +{ + os_aio_array_t* array; + ulint segment; + os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE]; + ulint n_consecutive; + ulint total_len; + ulint offs; + os_offset_t lowest_offset; + ulint biggest_age; + ulint age; + byte* combined_buf; + byte* combined_buf2; + ibool ret; + ibool any_reserved; + ulint n; + os_aio_slot_t* aio_slot; + + /* Fix compiler warning */ + *consecutive_ios = NULL; + + segment = os_aio_get_array_and_local_segment(&array, global_segment); + +restart: + /* NOTE! We only access constant fields in os_aio_array. 
Therefore + we do not have to acquire the protecting mutex yet */ + + srv_set_io_thread_op_info(global_segment, + "looking for i/o requests (a)"); + ut_ad(os_aio_validate_skip()); + ut_ad(segment < array->n_segments); + + n = array->n_slots / array->n_segments; + + /* Look through n slots after the segment * n'th slot */ + + if (array == os_aio_read_array + && os_aio_recommend_sleep_for_read_threads) { + + /* Give other threads chance to add several i/os to the array + at once. */ + + goto recommended_sleep; + } + + srv_set_io_thread_op_info(global_segment, + "looking for i/o requests (b)"); + + /* Check if there is a slot for which the i/o has already been + done */ + any_reserved = FALSE; + + os_mutex_enter(array->mutex); + + for (ulint i = 0; i < n; i++) { + os_aio_slot_t* slot; + + slot = os_aio_array_get_nth_slot(array, i + segment * n); + + if (!slot->reserved) { + continue; + } else if (slot->io_already_done) { + + if (os_aio_print_debug) { + fprintf(stderr, + "InnoDB: i/o for slot %lu" + " already done, returning\n", + (ulong) i); + } + + aio_slot = slot; + ret = TRUE; + goto slot_io_done; + } else { + any_reserved = TRUE; + } + } + + /* There is no completed request. + If there is no pending request at all, + and the system is being shut down, exit. */ + if (!any_reserved && srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) { + os_mutex_exit(array->mutex); + *message1 = NULL; + *message2 = NULL; + return(TRUE); + } + + n_consecutive = 0; + + /* If there are at least 2 seconds old requests, then pick the oldest + one to prevent starvation. If several requests have the same age, + then pick the one at the lowest offset. 
*/ + + biggest_age = 0; + lowest_offset = IB_UINT64_MAX; + + for (ulint i = 0; i < n; i++) { + os_aio_slot_t* slot; + + slot = os_aio_array_get_nth_slot(array, i + segment * n); + + if (slot->reserved) { + + age = (ulint) difftime( + ut_time(), slot->reservation_time); + + if ((age >= 2 && age > biggest_age) + || (age >= 2 && age == biggest_age + && slot->offset < lowest_offset)) { + + /* Found an i/o request */ + consecutive_ios[0] = slot; + + n_consecutive = 1; + + biggest_age = age; + lowest_offset = slot->offset; + } + } + } + + if (n_consecutive == 0) { + /* There were no old requests. Look for an i/o request at the + lowest offset in the array (we ignore the high 32 bits of the + offset in these heuristics) */ + + lowest_offset = IB_UINT64_MAX; + + for (ulint i = 0; i < n; i++) { + os_aio_slot_t* slot; + + slot = os_aio_array_get_nth_slot( + array, i + segment * n); + + if (slot->reserved && slot->offset < lowest_offset) { + + /* Found an i/o request */ + consecutive_ios[0] = slot; + + n_consecutive = 1; + + lowest_offset = slot->offset; + } + } + } + + if (n_consecutive == 0) { + + /* No i/o requested at the moment */ + + goto wait_for_io; + } + + /* if n_consecutive != 0, then we have assigned + something valid to consecutive_ios[0] */ + ut_ad(n_consecutive != 0); + ut_ad(consecutive_ios[0] != NULL); + + aio_slot = consecutive_ios[0]; + + /* Check if there are several consecutive blocks to read or write */ + +consecutive_loop: + for (ulint i = 0; i < n; i++) { + os_aio_slot_t* slot; + + slot = os_aio_array_get_nth_slot(array, i + segment * n); + + if (slot->reserved + && slot != aio_slot + && slot->offset == aio_slot->offset + aio_slot->len + && slot->type == aio_slot->type + && slot->file == aio_slot->file) { + + /* Found a consecutive i/o request */ + + consecutive_ios[n_consecutive] = slot; + n_consecutive++; + + aio_slot = slot; + + if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) { + + goto consecutive_loop; + } else { + break; + } + } + } + + 
srv_set_io_thread_op_info(global_segment, "consecutive i/o requests"); + + /* We have now collected n_consecutive i/o requests in the array; + allocate a single buffer which can hold all data, and perform the + i/o */ + + total_len = 0; + aio_slot = consecutive_ios[0]; + + for (ulint i = 0; i < n_consecutive; i++) { + total_len += consecutive_ios[i]->len; + } + + if (n_consecutive == 1) { + /* We can use the buffer of the i/o request */ + combined_buf = aio_slot->buf; + combined_buf2 = NULL; + } else { + combined_buf2 = static_cast( + ut_malloc(total_len + UNIV_PAGE_SIZE)); + + ut_a(combined_buf2); + + combined_buf = static_cast( + ut_align(combined_buf2, UNIV_PAGE_SIZE)); + } + + /* We release the array mutex for the time of the i/o: NOTE that + this assumes that there is just one i/o-handler thread serving + a single segment of slots! */ + + os_mutex_exit(array->mutex); + + if (aio_slot->type == OS_FILE_WRITE && n_consecutive > 1) { + /* Copy the buffers to the combined buffer */ + offs = 0; + + for (ulint i = 0; i < n_consecutive; i++) { + + ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf, + consecutive_ios[i]->len); + + offs += consecutive_ios[i]->len; + } + } + + srv_set_io_thread_op_info(global_segment, "doing file i/o"); + + /* Do the i/o with ordinary, synchronous i/o functions: */ + if (aio_slot->type == OS_FILE_WRITE) { + ut_ad(!srv_read_only_mode); + ret = os_file_write( + aio_slot->name, aio_slot->file, combined_buf, + aio_slot->offset, total_len); + } else { + ret = os_file_read( + aio_slot->file, combined_buf, + aio_slot->offset, total_len, + aio_slot->page_compression); + } + + ut_a(ret); + srv_set_io_thread_op_info(global_segment, "file i/o done"); + + if (aio_slot->type == OS_FILE_READ && n_consecutive > 1) { + /* Copy the combined buffer to individual buffers */ + offs = 0; + + for (ulint i = 0; i < n_consecutive; i++) { + + ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs, + consecutive_ios[i]->len); + offs += 
consecutive_ios[i]->len; + } + } + + if (combined_buf2) { + ut_free(combined_buf2); + } + + os_mutex_enter(array->mutex); + + /* Mark the i/os done in slots */ + + for (ulint i = 0; i < n_consecutive; i++) { + consecutive_ios[i]->io_already_done = TRUE; + } + + /* We return the messages for the first slot now, and if there were + several slots, the messages will be returned with subsequent calls + of this function */ + +slot_io_done: + + ut_a(aio_slot->reserved); + + *message1 = aio_slot->message1; + *message2 = aio_slot->message2; + + *type = aio_slot->type; + *space_id = aio_slot->space_id; + + os_mutex_exit(array->mutex); + + os_aio_array_free_slot(array, aio_slot); + + return(ret); + +wait_for_io: + srv_set_io_thread_op_info(global_segment, "resetting wait event"); + + /* We wait here until there again can be i/os in the segment + of this thread */ + + os_event_reset(os_aio_segment_wait_events[global_segment]); + + os_mutex_exit(array->mutex); + +recommended_sleep: + srv_set_io_thread_op_info(global_segment, "waiting for i/o request"); + + os_event_wait(os_aio_segment_wait_events[global_segment]); + + goto restart; +} + +/**********************************************************************//** +Validates the consistency of an aio array. +@return true if ok */ +static +bool +os_aio_array_validate( +/*==================*/ + os_aio_array_t* array) /*!< in: aio wait array */ +{ + ulint i; + ulint n_reserved = 0; + + os_mutex_enter(array->mutex); + + ut_a(array->n_slots > 0); + ut_a(array->n_segments > 0); + + for (i = 0; i < array->n_slots; i++) { + os_aio_slot_t* slot; + + slot = os_aio_array_get_nth_slot(array, i); + + if (slot->reserved) { + n_reserved++; + ut_a(slot->len > 0); + } + } + + ut_a(array->n_reserved == n_reserved); + + os_mutex_exit(array->mutex); + + return(true); +} + +/**********************************************************************//** +Validates the consistency the aio system. 
+@return TRUE if ok */ +UNIV_INTERN +ibool +os_aio_validate(void) +/*=================*/ +{ + os_aio_array_validate(os_aio_read_array); + + if (os_aio_write_array != 0) { + os_aio_array_validate(os_aio_write_array); + } + + if (os_aio_ibuf_array != 0) { + os_aio_array_validate(os_aio_ibuf_array); + } + + if (os_aio_log_array != 0) { + os_aio_array_validate(os_aio_log_array); + } + + if (os_aio_sync_array != 0) { + os_aio_array_validate(os_aio_sync_array); + } + + return(TRUE); +} + +/**********************************************************************//** +Prints pending IO requests per segment of an aio array. +We probably don't need per segment statistics but they can help us +during development phase to see if the IO requests are being +distributed as expected. */ +static +void +os_aio_print_segment_info( +/*======================*/ + FILE* file, /*!< in: file where to print */ + ulint* n_seg, /*!< in: pending IO array */ + os_aio_array_t* array) /*!< in: array to process */ +{ + ulint i; + + ut_ad(array); + ut_ad(n_seg); + ut_ad(array->n_segments > 0); + + if (array->n_segments == 1) { + return; + } + + fprintf(file, " ["); + for (i = 0; i < array->n_segments; i++) { + if (i != 0) { + fprintf(file, ", "); + } + + fprintf(file, "%lu", n_seg[i]); + } + fprintf(file, "] "); +} + +/**********************************************************************//** +Prints info about the aio array. 
*/ +UNIV_INTERN +void +os_aio_print_array( +/*==============*/ + FILE* file, /*!< in: file where to print */ + os_aio_array_t* array) /*!< in: aio array to print */ +{ + ulint n_reserved = 0; + ulint n_res_seg[SRV_MAX_N_IO_THREADS]; + + os_mutex_enter(array->mutex); + + ut_a(array->n_slots > 0); + ut_a(array->n_segments > 0); + + memset(n_res_seg, 0x0, sizeof(n_res_seg)); + + for (ulint i = 0; i < array->n_slots; ++i) { + os_aio_slot_t* slot; + ulint seg_no; + + slot = os_aio_array_get_nth_slot(array, i); + + seg_no = (i * array->n_segments) / array->n_slots; + + if (slot->reserved) { + ++n_reserved; + ++n_res_seg[seg_no]; + + ut_a(slot->len > 0); + } + } + + ut_a(array->n_reserved == n_reserved); + + fprintf(file, " %lu", (ulong) n_reserved); + + os_aio_print_segment_info(file, n_res_seg, array); + + os_mutex_exit(array->mutex); +} + +/**********************************************************************//** +Prints info of the aio arrays. */ +UNIV_INTERN +void +os_aio_print( +/*=========*/ + FILE* file) /*!< in: file where to print */ +{ + time_t current_time; + double time_elapsed; + double avg_bytes_read; + + for (ulint i = 0; i < srv_n_file_io_threads; ++i) { + fprintf(file, "I/O thread %lu state: %s (%s)", + (ulong) i, + srv_io_thread_op_info[i], + srv_io_thread_function[i]); + +#ifndef __WIN__ + if (os_aio_segment_wait_events[i]->is_set) { + fprintf(file, " ev set"); + } +#endif /* __WIN__ */ + + fprintf(file, "\n"); + } + + fputs("Pending normal aio reads:", file); + + os_aio_print_array(file, os_aio_read_array); + + if (os_aio_write_array != 0) { + fputs(", aio writes:", file); + os_aio_print_array(file, os_aio_write_array); + } + + if (os_aio_ibuf_array != 0) { + fputs(",\n ibuf aio reads:", file); + os_aio_print_array(file, os_aio_ibuf_array); + } + + if (os_aio_log_array != 0) { + fputs(", log i/o's:", file); + os_aio_print_array(file, os_aio_log_array); + } + + if (os_aio_sync_array != 0) { + fputs(", sync i/o's:", file); + os_aio_print_array(file, 
os_aio_sync_array); + } + + putc('\n', file); + current_time = ut_time(); + time_elapsed = 0.001 + difftime(current_time, os_last_printout); + + fprintf(file, + "Pending flushes (fsync) log: %lu; buffer pool: %lu\n" + "%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n", + (ulong) fil_n_pending_log_flushes, + (ulong) fil_n_pending_tablespace_flushes, + (ulong) os_n_file_reads, + (ulong) os_n_file_writes, + (ulong) os_n_fsyncs); + + if (os_file_n_pending_preads != 0 || os_file_n_pending_pwrites != 0) { + fprintf(file, + "%lu pending preads, %lu pending pwrites\n", + (ulong) os_file_n_pending_preads, + (ulong) os_file_n_pending_pwrites); + } + + if (os_n_file_reads == os_n_file_reads_old) { + avg_bytes_read = 0.0; + } else { + avg_bytes_read = (double) os_bytes_read_since_printout + / (os_n_file_reads - os_n_file_reads_old); + } + + fprintf(file, + "%.2f reads/s, %lu avg bytes/read," + " %.2f writes/s, %.2f fsyncs/s\n", + (os_n_file_reads - os_n_file_reads_old) + / time_elapsed, + (ulong) avg_bytes_read, + (os_n_file_writes - os_n_file_writes_old) + / time_elapsed, + (os_n_fsyncs - os_n_fsyncs_old) + / time_elapsed); + + os_n_file_reads_old = os_n_file_reads; + os_n_file_writes_old = os_n_file_writes; + os_n_fsyncs_old = os_n_fsyncs; + os_bytes_read_since_printout = 0; + + os_last_printout = current_time; +} + +/**********************************************************************//** +Refreshes the statistics used to print per-second averages. */ +UNIV_INTERN +void +os_aio_refresh_stats(void) +/*======================*/ +{ + os_n_file_reads_old = os_n_file_reads; + os_n_file_writes_old = os_n_file_writes; + os_n_fsyncs_old = os_n_fsyncs; + os_bytes_read_since_printout = 0; + + os_last_printout = time(NULL); +} + +#ifdef UNIV_DEBUG +/**********************************************************************//** +Checks that all slots in the system have been freed, that is, there are +no pending io operations. 
+@return TRUE if all free */ +UNIV_INTERN +ibool +os_aio_all_slots_free(void) +/*=======================*/ +{ + os_aio_array_t* array; + ulint n_res = 0; + + array = os_aio_read_array; + + os_mutex_enter(array->mutex); + + n_res += array->n_reserved; + + os_mutex_exit(array->mutex); + + if (!srv_read_only_mode) { + ut_a(os_aio_write_array == 0); + + array = os_aio_write_array; + + os_mutex_enter(array->mutex); + + n_res += array->n_reserved; + + os_mutex_exit(array->mutex); + + ut_a(os_aio_ibuf_array == 0); + + array = os_aio_ibuf_array; + + os_mutex_enter(array->mutex); + + n_res += array->n_reserved; + + os_mutex_exit(array->mutex); + } + + ut_a(os_aio_log_array == 0); + + array = os_aio_log_array; + + os_mutex_enter(array->mutex); + + n_res += array->n_reserved; + + os_mutex_exit(array->mutex); + + array = os_aio_sync_array; + + os_mutex_enter(array->mutex); + + n_res += array->n_reserved; + + os_mutex_exit(array->mutex); + + if (n_res == 0) { + + return(TRUE); + } + + return(FALSE); +} +#endif /* UNIV_DEBUG */ + +#endif /* !UNIV_HOTBACKUP */ + +#ifdef _WIN32 +#include +#ifndef FSCTL_FILE_LEVEL_TRIM +#define FSCTL_FILE_LEVEL_TRIM CTL_CODE(FILE_DEVICE_FILE_SYSTEM, 130, METHOD_BUFFERED, FILE_WRITE_DATA) +typedef struct _FILE_LEVEL_TRIM_RANGE { + DWORDLONG Offset; + DWORDLONG Length; +} FILE_LEVEL_TRIM_RANGE, *PFILE_LEVEL_TRIM_RANGE; + +typedef struct _FILE_LEVEL_TRIM { + DWORD Key; + DWORD NumRanges; + FILE_LEVEL_TRIM_RANGE Ranges[1]; +} FILE_LEVEL_TRIM, *PFILE_LEVEL_TRIM; +#endif +#endif + +/**********************************************************************//** +Directly manipulate the allocated disk space by deallocating for the file referred to +by fd for the byte range starting at offset and continuing for len bytes. +Within the specified range, partial file system blocks are zeroed, and whole +file system blocks are removed from the file. After a successful call, +subsequent reads from this range will return zeroes. 
+@return true if success, false if error */ +UNIV_INTERN +ibool +os_file_trim( +/*=========*/ + os_file_t file, /*!< in: file to be trimmed */ + os_aio_slot_t* slot, /*!< in: slot structure */ + ulint len) /*!< in: length of area */ +{ + +#define SECT_SIZE 512 + size_t trim_len = UNIV_PAGE_SIZE - len; + os_offset_t off = slot->offset + len; + // len here should be alligned to sector size + ut_a((trim_len % SECT_SIZE) == 0); + ut_a((len % SECT_SIZE) == 0); + + // Nothing to do if trim length is zero or if actual write + // size is initialized and it is smaller than current write size. + // In first write if we trim we set write_size to actual bytes + // written and rest of the page is trimmed. In following writes + // there is no need to trim again if write_size only increases + // because rest of the page is already trimmed. If actual write + // size decreases we need to trim again. + if (trim_len == 0 || + (slot->write_size && + *slot->write_size > 0 && + len >= *slot->write_size)) { + +#ifdef UNIV_PAGECOMPRESS_DEBUG + fprintf(stderr, "Note: TRIM: write_size %lu trim_len %lu len %lu\n", + *slot->write_size, trim_len, len); +#endif + + if (*slot->write_size > 0 && len >= *slot->write_size) { + srv_stats.page_compressed_trim_op_saved.inc(); + } + + *slot->write_size = len; + + return (TRUE); + } + +#ifdef __linux__ +#if defined(FALLOC_FL_PUNCH_HOLE) && defined (FALLOC_FL_KEEP_SIZE) + int ret = fallocate(file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, trim_len); + + if (ret) { + /* After first failure do not try to trim again */ + os_fallocate_failed = TRUE; + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: [Warning] fallocate call failed with error code %d.\n" + " InnoDB: start: %lx len: %lu payload: %lu\n" + " InnoDB: Disabling fallocate for now.\n", ret, (slot->offset+len), trim_len, len); + + os_file_handle_error_no_exit(slot->name, + " fallocate(FALLOC_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE) ", + FALSE, __FILE__, __LINE__); + + if (slot->write_size) { 
+ *slot->write_size = 0; + } + + return (FALSE); + } else { + if (slot->write_size) { + *slot->write_size = len; + } + } +#else + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: [Warning] fallocate not supported on this installation." + " InnoDB: Disabling fallocate for now."); + os_fallocate_failed = TRUE; + if (slot->write_size) { + *slot->write_size = 0; + } + +#endif /* HAVE_FALLOCATE ... */ + +#elif defined(_WIN32) + FILE_LEVEL_TRIM flt; + flt.Key = 0; + flt.NumRanges = 1; + flt.Ranges[0].Offset = off; + flt.Ranges[0].Length = trim_len; + + BOOL ret = DeviceIoControl(file,FSCTL_FILE_LEVEL_TRIM,&flt, sizeof(flt), NULL, NULL, NULL, NULL); + + if (!ret) { + /* After first failure do not try to trim again */ + os_fallocate_failed = TRUE; + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: [Warning] fallocate call failed with error.\n" + " InnoDB: start: %lx len: %du payload: %lu\n" + " InnoDB: Disabling fallocate for now.\n", (slot->offset+len), trim_len, len); + + os_file_handle_error_no_exit(slot->name, + " DeviceIOControl(FSCTL_FILE_LEVEL_TRIM) ", + FALSE, __FILE__, __LINE__); + + if (slot->write_size) { + *slot->write_size = 0; + } + return (FALSE); + } else { + if (slot->write_size) { + *slot->write_size = len; + } + } +#endif + + srv_stats.page_compression_trim_sect512.add((trim_len / SECT_SIZE)); + srv_stats.page_compression_trim_sect4096.add((trim_len / (SECT_SIZE*8))); + srv_stats.page_compressed_trim_op.inc(); + + return (TRUE); + +} + +/**********************************************************************//** +Allocate memory for temporal buffer used for page encryption. This +buffer is freed later. 
*/ +UNIV_INTERN +void +os_slot_alloc_page_buf2( +/*===================*/ + os_aio_slot_t* slot) /*!< in: slot structure */ +{ + byte* cbuf2; + byte* cbuf; + + cbuf2 = static_cast(ut_malloc(UNIV_PAGE_SIZE*2)); + cbuf = static_cast(ut_align(cbuf2, UNIV_PAGE_SIZE)); + slot->page_encryption_page = static_cast(cbuf2); + slot->page_buf2 = static_cast(cbuf); +} + +/**********************************************************************//** +Allocate memory for temporal buffer used for page compression. This +buffer is freed later. */ +UNIV_INTERN +void +os_slot_alloc_page_buf( +/*===================*/ + os_aio_slot_t* slot) /*!< in: slot structure */ +{ + byte* cbuf2; + byte* cbuf; + + ut_a(slot != NULL); + /* We allocate extra to avoid memory overwrite on compression */ + cbuf2 = static_cast(ut_malloc(UNIV_PAGE_SIZE*2)); + cbuf = static_cast(ut_align(cbuf2, UNIV_PAGE_SIZE)); + slot->page_compression_page = static_cast(cbuf2); + slot->page_buf = static_cast(cbuf); + ut_a(slot->page_buf != NULL); +} + +#ifdef HAVE_LZO +/**********************************************************************//** +Allocate memory for temporal memory used for page compression when +LZO compression method is used */ +UNIV_INTERN +void +os_slot_alloc_lzo_mem( +/*===================*/ + os_aio_slot_t* slot) /*!< in: slot structure */ +{ + ut_a(slot != NULL); + slot->lzo_mem = static_cast(ut_malloc(LZO1X_1_15_MEM_COMPRESS)); + ut_a(slot->lzo_mem != NULL); +} +#endif + diff --git a/storage/xtradb/os/os0file.cc.rej b/storage/xtradb/os/os0file.cc.rej new file mode 100644 index 0000000000000..6455224e46bc5 --- /dev/null +++ b/storage/xtradb/os/os0file.cc.rej @@ -0,0 +1,20 @@ +--- storage/xtradb/os/os0file.cc ++++ storage/xtradb/os/os0file.cc +@@ -3175,7 +3175,7 @@ + + if (fil_page_is_encrypted((byte *)buf)) { + // if (page_encryption) { +- fil_decrypt_page(NULL, (byte *)buf, n, NULL); ++ fil_decrypt_page(NULL, (byte *)buf, n, NULL, 0); + } + + +@@ -4692,7 +4692,7 @@ + + ut_ad(slot->page_buf2); + //FF 
+- tmp = fil_encrypt_page(fil_node_get_space_id(slot->message1), (byte *)buf, slot->page_buf2, len, page_encryption_key, &real_len); ++ tmp = fil_encrypt_page(fil_node_get_space_id(slot->message1), (byte *)buf, slot->page_buf2, len, page_encryption_key, &real_len, 0); + + /* If encryption succeeded, set up the length and buffer */ + if (tmp != buf) { diff --git a/storage/xtradb/srv/srv0mon.cc b/storage/xtradb/srv/srv0mon.cc index f276efdc021d6..0e0c3ebe4679a 100644 --- a/storage/xtradb/srv/srv0mon.cc +++ b/storage/xtradb/srv/srv0mon.cc @@ -1879,6 +1879,15 @@ srv_mon_process_existing_counter( case MONITOR_OVLD_PAGES_PAGE_COMPRESSION_ERROR: value = srv_stats.pages_page_compression_error; break; + case MONITOR_OVLD_PAGES_PAGE_ENCRYPTED: + value = srv_stats.pages_page_encrypted; + break; + case MONITOR_OVLD_PAGES_PAGE_DECRYPTED: + value = srv_stats.pages_page_decrypted; + break; + case MONITOR_OVLD_PAGES_PAGE_ENCRYPTION_ERROR: + value = srv_stats.pages_page_encryption_error; + break; default: ut_error; diff --git a/storage/xtradb/srv/srv0srv.cc b/storage/xtradb/srv/srv0srv.cc index a12a8b197fb06..b9bff1ddccac7 100644 --- a/storage/xtradb/srv/srv0srv.cc +++ b/storage/xtradb/srv/srv0srv.cc @@ -1907,6 +1907,10 @@ srv_export_innodb_status(void) export_vars.innodb_page_compressed_trim_op = srv_stats.page_compressed_trim_op; export_vars.innodb_page_compressed_trim_op_saved = srv_stats.page_compressed_trim_op_saved; export_vars.innodb_pages_page_decompressed = srv_stats.pages_page_decompressed; + export_vars.innodb_pages_page_compression_error = srv_stats.pages_page_compression_error; + export_vars.innodb_pages_page_decrypted = srv_stats.pages_page_decrypted; + export_vars.innodb_pages_page_encrypted = srv_stats.pages_page_encrypted; + export_vars.innodb_pages_page_encryption_error = srv_stats.pages_page_encryption_error; export_vars.innodb_defragment_compression_failures = btr_defragment_compression_failures; diff --git a/unittest/eperi/CMakeLists.txt 
# Copyright (C) 2014 eperi GmbH. All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

# Header search path shared by every eperi unit test.
INCLUDE_DIRECTORIES(
  ${CMAKE_SOURCE_DIR}/sql
  ${PCRE_INCLUDES}
  ${CMAKE_SOURCE_DIR}/include
# ${CMAKE_SOURCE_DIR}/include/mysql
  ${CMAKE_SOURCE_DIR}/unittest/mytap
  ${CMAKE_SOURCE_DIR}/extra/yassl/include
  ${CMAKE_SOURCE_DIR}/storage/perfschema/unittest
  ${CMAKE_SOURCE_DIR}/storage/perfschema
  ${CMAKE_SOURCE_DIR}/storage/xtradb/include
)

# Plain C tests.
MY_ADD_TESTS(eperi)
MY_ADD_TESTS(eperi_aes LINK_LIBRARIES mysys_ssl dbug)

# C++ tests.
MY_ADD_TESTS(EperiKeySingleton EXT "cc" LINK_LIBRARIES xtradb pcre mysys_ssl)
MY_ADD_TESTS(pageenc EXT "cc" LINK_LIBRARIES xtradb perfschema mysys mysys_ssl sql mysql)

# Fixture files the tests open by relative name; copy them next to the
# test binaries, in the same order as before.
SET(EPERI_TEST_FIXTURES
  keys.txt
  xaa
  xab
  xac
  xad
  xae
  xaf
  keys.enc
  compressed
  compressed_full
)

FOREACH(eperi_fixture ${EPERI_TEST_FIXTURES})
  FILE(COPY
    ${CMAKE_CURRENT_LIST_DIR}/${eperi_fixture}
    DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/)
ENDFOREACH()
/* The guard macro tested by #ifndef and the one defined must be the same
name; previously EPERIKEYSINGLETONPATTERN_T_H_ was defined instead, so a
second inclusion of this header redeclared the class. */
#ifndef EPERIKEYSINGLETON_T_H_
#define EPERIKEYSINGLETON_T_H_

/** Empty class declared for the EperiKeySingleton unit test; it only has
a constructor and a virtual destructor. */
class EperiKeySingleton {
public:
	EperiKeySingleton();
	virtual ~EperiKeySingleton();
};

#endif /* EPERIKEYSINGLETON_T_H_ */
/**
 * Reads a whole file into a freshly allocated buffer.
 *
 * The buffer holds the file contents followed by one terminating '\0'
 * byte, so text files can be used as C strings; binary content may of
 * course contain earlier zero bytes.
 *
 * @param fileName  path of the file to read
 * @return buffer owned by the caller (release with free()), or NULL if
 *         the file cannot be opened, sized or read completely, or if
 *         memory cannot be allocated
 */
byte* readFile(char* fileName) {
	FILE *fileptr;
	byte *buffer = NULL;
	long filelen;

	if (fileName == NULL) {
		return NULL;
	}

	fileptr = fopen(fileName, "rb");	/* binary mode: no newline translation */
	if (fileptr == NULL) {
		return NULL;
	}

	/* Determine the size by seeking to the end and back. */
	if (fseek(fileptr, 0, SEEK_END) != 0) {
		goto done;
	}
	filelen = ftell(fileptr);
	if (filelen < 0) {
		goto done;
	}
	if (fseek(fileptr, 0, SEEK_SET) != 0) {
		goto done;
	}

	buffer = malloc((size_t) filelen + 1);	/* file + '\0' */
	if (buffer == NULL) {
		goto done;
	}

	if (filelen > 0
	    && fread(buffer, 1, (size_t) filelen, fileptr) != (size_t) filelen) {
		/* Short read: do not hand out a partially filled buffer. */
		free(buffer);
		buffer = NULL;
		goto done;
	}

	buffer[filelen] = '\0';

done:
	fclose(fileptr);
	return buffer;
}
+ uint8 i_len = 16; + char* dest = (char *) malloc(2*s_len*sizeof(char)); + ulint dest_len = 0; + my_aes_encrypt_cbc(source, s_len, dest, &dest_len, key, k_len, iv, i_len); + source = (char *) malloc(strlen(MY_AES_TEST_TEXTBLOCK) * sizeof(char)); + my_aes_decrypt_cbc(dest , strlen(dest), source, &dest_len, key, k_len, iv, i_len); + ok(strcmp(source, MY_AES_TEST_JOSHUA),"Decrypted text is identical to original text."); + + key="7B3B8DA94B77F91A6E05037B21AD5F6E86BD4657C45D97BC7FF14313A781B5A3"; + k_len = 32; + dest = (char *) malloc(2*s_len*sizeof(char)); + my_aes_encrypt_cbc(source, s_len, dest, &dest_len, key, k_len, iv, i_len); + source = (char *) malloc(strlen(MY_AES_TEST_TEXTBLOCK) * sizeof(char)); + my_aes_decrypt_cbc(dest , strlen(dest), source, &dest_len, key, k_len, iv, i_len); + ok(strcmp(source, MY_AES_TEST_JOSHUA),"Decrypted text is identical to original text."); + free(source); + free(dest); +} + +void +test_cbc_large() +{ + plan(1); + char* source = MY_AES_TEST_TEXTBLOCK; + ulint s_len = (ulint)strlen(source); + + char* key = "3C5DC9153A6FE5F22516E217C1603BF7"; + uint8 k_len = 16; + char* iv = "F0974007D619466B9EBF8D4F6E302AA3"; + uint8 i_len = 16; + char* dest = (char *) malloc( 2* s_len * sizeof(char)); + ulint dest_len = 0; + dump_buffer(10,source); + dump_buffer(10,dest); + my_aes_encrypt_cbc(source, s_len, dest, &dest_len, key, k_len, iv, i_len); + source = (char *) malloc(strlen(MY_AES_TEST_TEXTBLOCK) * sizeof(char)); + my_aes_decrypt_cbc(dest , strlen(dest), source, &dest_len, key, k_len, iv, i_len); + ok(strcmp(source, MY_AES_TEST_TEXTBLOCK),"Decrypted text is identical to original text."); + free(source); + free(dest); +} + +void +test_wrong_key() +{ + plan(1); + char* source = MY_AES_TEST_TEXTBLOCK; + ulint s_len = (ulint)strlen(source); + + char* key = "3C5DC9153A6FE5F22516E217C1603BF7"; + uint8 k_len = 16; + char* iv = "F0974007D619466B9EBF8D4F6E302AA3"; + uint8 i_len = 16; + char* dest = (char *) malloc( 2* s_len * sizeof(char)); + ulint 
dest_len = 0; + dump_buffer(10,source); + dump_buffer(10,dest); + my_aes_encrypt_cbc(source, s_len, dest, &dest_len, key, k_len, iv, i_len); + + iv = "F1A74007D619455B9EBF8D4F6E302AA3"; + source = (char *) malloc(strlen(MY_AES_TEST_TEXTBLOCK) * sizeof(char)); + my_aes_decrypt_cbc(dest , strlen(dest), source, &dest_len, key, k_len, iv, i_len); + ok(strcmp(source, MY_AES_TEST_TEXTBLOCK) != 0,"Using wrong iv results in wrong decryption."); + free(source); + free(dest); +} + +void +test_cbc() +{ + plan(1); + int i; + char source[20]; + for(i=0; i<20; i++) { + source[i] = 5; + } + ulint s_len = 20; + char dest[32]; + for(i=0; i<32; i++){ + dest[i]=0; + } + ulint dest_len = 0; + char* key = "583BE7F334F85E7D9DDB362E9AC38151"; + uint8 k_len = 16; + char* iv = "3325CC3F02203FB6B849990042E58BCB"; + uint8 i_len = 16; + int ec = my_aes_encrypt_cbc(source, s_len, &dest, &dest_len, key, k_len, iv, i_len); + ok(ec == AES_OK, "Checking return code."); + for(i=0; i<20; i++) { + source[i] = 0; + } + my_aes_decrypt_cbc(dest , dest_len, &source, &dest_len, key, k_len, iv, i_len); + ok(strcmp(source, "Beam me up, Scotty."),"Decrypted text is identical to original text."); + +} + +void +test_cbc_resultsize() +{ + plan(2); + char *source = (char*) malloc(5000*sizeof(char)); + source = "abcdefghijklmnopqrstfjdklfkjdsljsdlkfjsaklföjsfölkdsjfölsdkjklösjsdklfjdsklöfjsdalökfjdsklöjfölksdjfklösdajfklösdaj"; + ulint s_len = (ulint) strlen(source); + char* dest = (char *) malloc(2 * s_len * sizeof(char)); + ulint d_len = 0; + char* key = "583BE7F334F85E7D9DDB362E9AC38151"; + uint8 k_len = 16; + char* iv = "3325CC3F02203FB6B849990042E58BCB"; + uint8 i_len = 16; + my_aes_encrypt_cbc(source, s_len, dest, &d_len, key, k_len, iv, i_len); + ok(d_len==128, "Destination length ok."); +} + +void test_cbc_enc_dec() { + unsigned char inbuf[1024]="Hello,world!"; +unsigned char encbuf[1024]; + +unsigned char key32[] = 
{0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa}; +unsigned char deckey32[] = {0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa} +; +unsigned char iv[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00}; +unsigned char deciv[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00}; + +AES_KEY aeskey; +AES_KEY aesdeckey; + +//Now enrypt +memset(encbuf, 0, sizeof(encbuf)); +AES_set_encrypt_key(key32, 32*8, &aeskey); +AES_cbc_encrypt(inbuf, encbuf, 16, &aeskey, iv, AES_ENCRYPT); + +//Now decrypt +unsigned char decbuf[1024]; +memset(decbuf, 0, sizeof(decbuf)); + +AES_set_decrypt_key(deckey32, 32*8, &aesdeckey); +AES_cbc_encrypt(encbuf, decbuf, 16, &aesdeckey, deciv, AES_DECRYPT); + + +int i = memcmp(decbuf,inbuf,16); +ok (i==0, "in==out"); + +} + +void test_cbc_enc_dec2() { + unsigned char inbuf[1024]="Hello,world!"; +unsigned char encbuf[1024]; + +unsigned char key32[] = {0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa}; +unsigned char deckey32[] = {0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa} +; +unsigned char iv[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00}; +unsigned char deciv[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00}; + +AES_KEY aeskey; +AES_KEY aesdeckey; + +//Now enrypt +memset(encbuf, 0, sizeof(encbuf)); +AES_set_encrypt_key(key32, 32*8, &aeskey); +AES_cbc_encrypt(inbuf, encbuf, 16, &aeskey, iv, AES_ENCRYPT); + +//Now decrypt +unsigned char decbuf[1024]; +memset(decbuf, 0, 
sizeof(decbuf)); + +AES_set_decrypt_key(deckey32, 32*8, &aesdeckey); +AES_cbc_encrypt(encbuf, decbuf, 16, &aesdeckey, deciv, AES_DECRYPT); +dump_buffer(16, decbuf); +dump_buffer(16, encbuf); + +int i = memcmp(decbuf,inbuf,16); +ok (i==0, "in==out"); + +} + + + +void test_cbc_enc_() { + unsigned char inbuf[1024]="Hello,world!"; +unsigned char encbuf[1024]; + +unsigned char key32[] = {0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa}; +unsigned char deckey32[] = {0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa,0xaa} +; +unsigned char iv[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00}; +unsigned char deciv[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00}; + +AES_KEY aeskey; +AES_KEY aesdeckey; + +//Now enrypt +memset(encbuf, 0, sizeof(encbuf)); +AES_set_encrypt_key(key32, 32*8, &aeskey); +AES_cbc_encrypt(inbuf, encbuf, 16, &aeskey, iv, AES_ENCRYPT); + +//Now decrypt +unsigned char decbuf[1024]; +memset(decbuf, 0, sizeof(decbuf)); + +AES_set_decrypt_key(deckey32, 32*8, &aesdeckey); +AES_cbc_encrypt(encbuf, decbuf, 16, &aesdeckey, deciv, AES_DECRYPT); +dump_buffer(16, decbuf); +dump_buffer(16, encbuf); + +int i = memcmp(decbuf,inbuf,16); +ok (i==0, "in==out"); + +} + + + +void test_page_enc_dec() { + char* buf = readFile("xaa"); + char* dest = (char *) malloc(16384*sizeof(char)); + //fil_encrypt_page(0,buf,dest,0,0,NULL,1); + + //fil_decrypt_page(NULL, dest, 0,NULL,1); + + ulint i = memcmp(buf,dest, 16384); + ok (i==0, "in==out"); +} + +/* + * Test if bytes for AES Key and IV are generated in the same way as in openssl commandline. 
/*
 * Test if bytes for AES Key and IV are generated in the same way as in
 * openssl commandline.
 *
 * All byte material is kept in unsigned char arrays: several expected
 * values are above 0x7F and do not fit a plain (possibly signed) char.
 * The arrays are passed directly so the callee receives pointers to the
 * first byte rather than pointers to whole arrays.
 * NOTE(review): assumes my_bytes_to_key() takes unsigned char pointers
 * for salt, key and iv - confirm against my_aes.h.
 */
void
test_bytes_to_key()
{
	plan(2);
	const unsigned char salt[] = {0x0c, 0x3b, 0x72, 0x1b, 0xfe, 0x07, 0xe2, 0xb3};
	const char *secret = "secret";
	unsigned char key[32];
	unsigned char iv[16];
	static const unsigned char keyresult[32] = {
		0x2E, 0xFF, 0xB7, 0x1D, 0xDB, 0x97, 0xA8, 0x3A,
		0x03, 0x5A, 0x06, 0xDF, 0xB0, 0xDD, 0x72, 0x29,
		0xA6, 0xD9, 0x1F, 0xFB, 0xE6, 0x06, 0x3B, 0x4B,
		0x81, 0x23, 0x85, 0x45, 0x71, 0x28, 0xFF, 0x1F};
	static const unsigned char ivresult[16] = {
		0x61, 0xFF, 0xC8, 0x27, 0x5B, 0x46, 0x4C, 0xBD,
		0x55, 0x82, 0x0E, 0x54, 0x8F, 0xE4, 0x44, 0xD9};

	my_bytes_to_key(salt, secret, key, iv);

	ok(memcmp(key, keyresult, sizeof(key)) == 0, "BytesToKey key generated successfully.");
	ok(memcmp(iv, ivresult, sizeof(iv)) == 0, "BytesToKey iv generated successfully.");
}
+3;7E892875A52C59A3B5883z6B13C31FBD;B374A26A71490437AA024E4FADD5B497FDFF1A8EA6FF12F6FB65AF2720B59CCF +255;F5502320F8429037B8DAEF761B189D12;770A8A65DA156D24EE2A093277530142 +256;F5502320F8429037B8DAEF761B189D12;770A8A65DA156D24EE2A093277530142 +4;F5502320F8429037B8DAEF761B189D12;770A8A65DA156D24EE2A093277530142770A8A65DA156D24EE2A093277530142770A8A65DA156D24EE2A093277530142770A8A65DA156D24EE2A093277530142770A8A65DA156D24EE2A093277530142770A8A65DA156D24EE2A093277530142770A8A65DA156D24EE2A093277530142770A8A65DA156D24EE2A093277530142770A8A65DA156D24EE2A093277530142770A8A65DA156D24EE2A093277530142770A8A65DA156D24EE2A093277530142770A8A65DA156D24EE2A093277530142770A8A65DA156D24EE2A093277530142770A8A65DA156D24EE2A093277530142770A8A65DA156D24EE2A093277530142770A8A65DA156D24EE2A093277530142770A8A65DA156D24EE2A093277530142770A8A65DA156D24EE2A093277530142770A8A65DA156D24EE2A093277530142770A8A65DA156D24EE2A093277530142770A8A65DA156D24EE2A093277530142770A8A65DA156D24EE2A093277530142770A8A65DA156D24EE2A093277530142770A8A65DA156D24EE2A093277530142 +5;021B0663D4DD7B54E2EBC852677E40BD;18420B5CBA31CCDFFE9716E91EB61374D05914F3ADE23E03 +6;9BF92CEA026CE732DA80821122A8CE97;966050D7777350B6FD5CCB3E5F648DA45C63BEFB6DEDDFA13443F156B7D35C84 +7;BC44D4AFD2D9FCD82A679E4DC6700D06;B5EA210C8C09EF20DB95EC584714A89F diff --git a/unittest/eperi/kf.txt b/unittest/eperi/kf.txt new file mode 100644 index 0000000000000..e468e2850e74f --- /dev/null +++ b/unittest/eperi/kf.txt @@ -0,0 +1 @@ +Florin diff --git a/unittest/eperi/kfo.txt b/unittest/eperi/kfo.txt new file mode 100644 index 0000000000000..b27dd3b900c93 --- /dev/null +++ b/unittest/eperi/kfo.txt @@ -0,0 +1 @@ +Salted__�Yc۸V��ʱ��T����/r��ҩs \ No newline at end of file diff --git a/unittest/eperi/my.cnf b/unittest/eperi/my.cnf new file mode 100644 index 0000000000000..09dd8d0449d2e --- /dev/null +++ b/unittest/eperi/my.cnf @@ -0,0 +1,94 @@ +# Example MySQL config file for small systems. 
+# +# This is for a system with little memory (<= 64M) where MySQL is only used +# from time to time and it's important that the mysqld daemon +# doesn't use many resources. +# +# MySQL programs look for option files in a set of +# locations which depend on the deployment platform. +# You can copy this option file to one of those +# locations. For information about these locations, see: +# http://dev.mysql.com/doc/mysql/en/option-files.html +# +# In this file, you can use all long options that a program supports. +# If you want to know which options a program supports, run the program +# with the "--help" option. + +# The following options will be passed to all MySQL clients +[client] +#password = your_password +port = 3306 +socket = /tmp/mysql.sock + +# Here follow entries for some specific programs + +# The MySQL server +[mysqld] +port = 3306 +socket = /tmp/mysql.sock +skip-external-locking +key_buffer_size = 16K +max_allowed_packet = 1M +table_open_cache = 4 +sort_buffer_size = 64K +read_buffer_size = 256K +read_rnd_buffer_size = 256K +net_buffer_length = 2K +thread_stack = 240K + +# Don't listen on a TCP/IP port at all. This can be a security enhancement, +# if all processes that need to connect to mysqld run on the same host. +# All interaction with mysqld must be made via Unix sockets or named pipes. +# Note that using this option without enabling named pipes on Windows +# (using the "enable-named-pipe" option) will render mysqld useless! +# +#skip-networking +server-id = 1 + +# Uncomment the following if you want to log updates +#log-bin=mysql-bin + +# binary logging format - mixed recommended +#binlog_format=mixed + +# Causes updates to non-transactional engines using statement format to be +# written directly to binary log.
Before using this option make sure that +# there are no dependencies between transactional and non-transactional +# tables such as in the statement INSERT INTO t_myisam SELECT * FROM +# t_innodb; otherwise, slaves may diverge from the master. +#binlog_direct_non_transactional_updates=TRUE + +# Uncomment the following if you are using InnoDB tables +innodb_data_home_dir = /home/florin/w/cxx/build-mariadb/db/mysql/data +innodb_data_file_path = ibdata1:10M:autoextend +innodb_log_group_home_dir = /home/florin/w/cxx/build-mariadb/db/mysql/data +# You can set .._buffer_pool_size up to 50 - 80 % +# of RAM but beware of setting memory usage too high +innodb_buffer_pool_size = 16M +#innodb_additional_mem_pool_size = 2M +# Set .._log_file_size to 25 % of buffer pool size +innodb_log_file_size = 5M +innodb_log_buffer_size = 8M +innodb_flush_log_at_trx_commit = 1 +innodb_lock_wait_timeout = 50 + +#innodb_data_encryption_providertype = 1 +#innodb_data_encryption_providername = keys.txt +#innodb_data_encryption_providerurl = /home/florin/w/cxx/build-mariadb/unittest/eperi + + +[mysqldump] +quick +max_allowed_packet = 16M + +[mysql] +no-auto-rehash +# Remove the next comment character if you are not familiar with SQL +#safe-updates + +[myisamchk] +key_buffer_size = 8M +sort_buffer_size = 8M + +[mysqlhotcopy] +interactive-timeout diff --git a/unittest/eperi/pageenc-t.cc b/unittest/eperi/pageenc-t.cc new file mode 100644 index 0000000000000..397831eeabf83 --- /dev/null +++ b/unittest/eperi/pageenc-t.cc @@ -0,0 +1,125 @@ +/* + * pageenc.cc + * + * Created on: 23.08.2014 + * Author: florin + */ +//#define UNIV_INLINE +typedef unsigned char byte; +typedef unsigned long int ulint; +typedef unsigned long int ibool; + + + +#include "pageenc-t.h" +#include +#include + +#include + +extern int summef(int a, int b); +extern int summef2(int a, int b); +extern int multiplikation(int a, int b); + + +extern byte* +fil_encrypt_page( +/*==============*/ + ulint space_id, /*!< in: tablespace id of 
the + table. */ + byte* buf, /*!< in: buffer from which to write; in aio + this must be appropriately aligned */ + byte* out_buf, /*!< out: compressed buffer */ + ulint len, /*!< in: length of input buffer.*/ + ulint compression_level, /*!< in: compression level */ + ulint* out_len, /*!< out: actual length of compressed page */ + ulint mode /*!< in: calling mode */ + ); + +/****************************************************************//** +For page encrypted pages decrypt the page after actual read +operation. +@return decrypted page */ +extern ulint +fil_decrypt_page( +/*================*/ + byte* page_buf, /*!< in: preallocated buffer or NULL */ + byte* buf, /*!< out: buffer from which to read; in aio + this must be appropriately aligned */ + ulint len, /*!< in: length of output buffer.*/ + ulint* write_size, /*!< in/out: Actual payload size of the decrypted data. */ + ibool* page_compressed, + ulint mode /*!