Repository: AlexDenisov/bitcode_retriever Branch: master Commit: 2428f193e039 Files: 13 Total size: 21.7 KB Directory structure: gitextract_ewr5giqk/ ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── macho_reader.c ├── macho_reader.h ├── macho_retriever.c ├── macho_retriever.h ├── macho_util.c ├── macho_util.h ├── main.c └── subject/ ├── main.c └── power2.c ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ build ================================================ FILE: LICENSE ================================================ The MIT License (MIT) Copyright (c) 2016 AlexDenisov Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: Makefile ================================================ CC=$(shell which clang) BUILD_DIR=build SUBJECT_DIR=subject SOURCE_FILES=main.c macho_retriever.c macho_reader.c macho_util.c RETRIEVER_BIN=$(BUILD_DIR)/bitcode_retriever all: $(BUILD_DIR) $(CC) $(SOURCE_FILES) -o $(RETRIEVER_BIN) -I/usr/include/libxml2 -lxar -lxml2 ### Testing OBJECT_FILES=i386.o x86_64.o subject: $(BUILD_DIR) fat.o @true fat.o: $(OBJECT_FILES) cd $(BUILD_DIR) && lipo -create $^ -output $@ %.o: $(CC) -fembed-bitcode -c -arch $* $(SUBJECT_DIR)/main.c -o $(BUILD_DIR)/main.o $(CC) -fembed-bitcode -c -arch $* $(SUBJECT_DIR)/power2.c -o $(BUILD_DIR)/power2.o $(CC) -fembed-bitcode -arch $* $(BUILD_DIR)/main.o $(BUILD_DIR)/power2.o -o $(BUILD_DIR)/$@ $(BUILD_DIR): mkdir $@ clean: rm -rf $(BUILD_DIR) ================================================ FILE: README.md ================================================ ## Bitcode Retriever Retrieves Bitcode from Mach-O binaries ### About [Bitcode](http://llvm.org/docs/BitCodeFormat.html) is stored as a [xar](https://en.wikipedia.org/wiki/Xar_(archiver)) archive inside of a Mach-O binary. This tool extracts the archive and puts it near the binary, so it can be easily inspected using `xar` and `llvm-dis`. ### Build Clone the repo and run `make`; the built binary is placed in the `build` directory. ```bash $ git clone git@github.com:AlexDenisov/bitcode_retriever.git $ cd bitcode_retriever $ make ``` ### Usage _Note: currently it does not work with static libraries. There is an open issue #1; if you need this feature, please leave a comment there, it will bump the priority of this project on my personal todo-list._ To use `bitcode_retriever`, simply feed it your binary and it will produce an archive with the bitcode. It accepts both fat and non-fat binaries. 
For fat binaries it produces separate archive for each slice, e.g.: ```bash $ bitcode_retriever fat_app i386.xar x86_64.xar arm64.xar ``` for non-fat binaries it produces just one archive with the bitcode: ```bash $ bitcode_retriever non_fat_app i386.xar ``` To skip the xar archive and obtain the bitcode immediately, pass the `-e` argument. ```bash $ bitcode_retriever -e fat_app i386.1.bc i386.2.bc x86_64.1.bc x86_64.2.bc ``` The project provides a sample binaries, you can play a bit with them: ```bash $ make subject $ cd build $ ./bitcode_retriever i386.o # or $ ./bitcode_retriever fat.o ``` The xar archive stores set of files with bitcode: ```bash $ xar -xf i386.o $ ls i386.o 1 2 ``` You can dump LLVM IR from each file (`1`, `2`) using [`llvm-dis`](http://llvm.org/docs/CommandGuide/llvm-dis.html) ```bash $ llvm-dis 1 $ llvm-dis 2 $ ls 1 2 1.ll 2.ll ``` ### Bugs and issues If you have any problems or found some bug - feel free to open an issue and/or send a pull request ================================================ FILE: macho_reader.c ================================================ #include "macho_reader.h" #include #include #include #include #include #include const static int uint32_size = sizeof(uint32_t); const static int fat_header_size = sizeof(struct fat_header); const static int fat_arch_size = sizeof(struct fat_arch); const static int mach_header_size = sizeof(struct mach_header); const static int mach_header_64_size = sizeof(struct mach_header_64); const static int load_command_size = sizeof(struct load_command); const static int segment_command_size = sizeof(struct segment_command); const static int segment_command_64_size = sizeof(struct segment_command_64); struct _cpu_type_names { cpu_type_t cputype; const char *cpu_name; }; static struct _cpu_type_names cpu_type_names[] = { { CPU_TYPE_I386, "i386" }, { CPU_TYPE_X86_64, "x86_64" }, { CPU_TYPE_ARM, "arm" }, { CPU_TYPE_ARM64, "arm64" } }; int get_cpu_type_count() { return 
(int)(sizeof(cpu_type_names) / sizeof(cpu_type_names[0])); } static const char *cpu_type_name(cpu_type_t cpu_type) { static int cpu_type_names_size = sizeof(cpu_type_names) / sizeof(struct _cpu_type_names); for (int i = 0; i < cpu_type_names_size; i++ ) { if (cpu_type == cpu_type_names[i].cputype) { return cpu_type_names[i].cpu_name; } } return "unknown"; } const char *get_cpu_type_name(struct mach_header *header) { return cpu_type_name(header->cputype); } const char *get_cpu_type_name_64(struct mach_header_64 *header) { return cpu_type_name(header->cputype); } uint32_t get_magic(FILE *stream, int offset) { uint32_t magic = 0; fseek(stream, offset, SEEK_SET); fread(&magic, uint32_size, 1, stream); rewind(stream); return magic; } int is_magic_macho(const uint32_t magic) { return magic == MH_MAGIC_64 || magic == MH_CIGAM_64 || magic == MH_MAGIC || magic == MH_CIGAM || magic == FAT_MAGIC || magic == FAT_CIGAM; } int is_magic_64(const uint32_t magic) { return magic == MH_MAGIC_64 || magic == MH_CIGAM_64; } int is_should_swap_bytes(const uint32_t magic) { return magic == MH_CIGAM || magic == MH_CIGAM_64 || magic == FAT_CIGAM; } int is_fat(const uint32_t magic) { return magic == FAT_MAGIC || magic == FAT_CIGAM; } struct fat_header *load_fat_header(FILE *stream, const int swap_bytes) { struct fat_header *header = malloc(fat_header_size); fread(header, fat_header_size, 1, stream); rewind(stream); if (swap_bytes) { swap_fat_header(header, 0); } return header; } struct fat_arch *load_fat_arch(FILE *stream, const int offset, const int swap_bytes) { struct fat_arch *arch = malloc(fat_arch_size); fseek(stream, offset, SEEK_SET); fread(arch, fat_arch_size, 1, stream); rewind(stream); if (swap_bytes) { swap_fat_arch(arch, 1, 0); } return arch; } uint32_t offset_for_arch(FILE *stream, const int index, const int swap_bytes) { int offset = fat_header_size + fat_arch_size * index; struct fat_arch *arch = load_fat_arch(stream, offset, swap_bytes); uint32_t arch_offset = arch->offset; 
free(arch); return arch_offset; } struct mach_header *load_mach_header(FILE *stream, const int offset, const int swap_bytes) { struct mach_header *header = malloc(mach_header_size); fseek(stream, offset, SEEK_SET); fread(header, mach_header_size, 1, stream); rewind(stream); if (swap_bytes) { swap_mach_header(header, 0); } return header; } struct mach_header_64 *load_mach_header_64(FILE *stream, const int offset, const int swap_bytes) { struct mach_header_64 *header = malloc(mach_header_64_size); fseek(stream, offset, SEEK_SET); fread(header, mach_header_64_size, 1, stream); rewind(stream); if (swap_bytes) { swap_mach_header_64(header, 0); } return header; } struct load_command *load_load_command(FILE *stream, const int offset, const int swap_bytes) { struct load_command *command = malloc(load_command_size); fseek(stream, offset, SEEK_SET); fread(command, load_command_size, 1, stream); rewind(stream); if (swap_bytes) { swap_load_command(command, 0); } return command; } struct segment_command *load_segment_command(FILE *stream, const int offset, const int swap_bytes) { struct segment_command *command = malloc(segment_command_size); fseek(stream, offset, SEEK_SET); fread(command, segment_command_size, 1, stream); rewind(stream); if (swap_bytes) { swap_segment_command(command, 0); } return command; } struct segment_command_64 *load_segment_command_64(FILE *stream, const int offset, const int swap_bytes) { struct segment_command_64 *command = malloc(segment_command_64_size); fseek(stream, offset, SEEK_SET); fread(command, segment_command_64_size, 1, stream); rewind(stream); if (swap_bytes) { swap_segment_command_64(command, 0); } return command; } struct segment_command *load_llvm_segment_command(FILE *stream, struct mach_header *header, const int offset, const int swap_bytes) { int cmd_offset = offset + mach_header_size; for (int i = 0; i < header->ncmds; i++) { struct load_command *cmd = load_load_command(stream, cmd_offset, swap_bytes); if (cmd->cmd == LC_SEGMENT) { 
struct segment_command *segment = load_segment_command(stream, cmd_offset, swap_bytes); if (!strncmp("__LLVM", segment->segname, 7)) { return segment; } free(segment); } cmd_offset += cmd->cmdsize; free(cmd); } return 0; } struct segment_command_64 *load_llvm_segment_command_64(FILE *stream, struct mach_header_64 *header, const int offset, const int swap_bytes) { int cmd_offset = offset + mach_header_64_size; for (int i = 0; i < header->ncmds; i++) { struct load_command *cmd = load_load_command(stream, cmd_offset, swap_bytes); if (cmd->cmd == LC_SEGMENT_64) { struct segment_command_64 *segment = load_segment_command_64(stream, cmd_offset, swap_bytes); if (!strncmp("__LLVM", segment->segname, 7)) { return segment; } free(segment); } cmd_offset += cmd->cmdsize; free(cmd); } return 0; } ================================================ FILE: macho_reader.h ================================================ #ifndef MACHO_READER_H #define MACHO_READER_H #ifdef __cplusplus extern "C" { #endif #include #include struct mach_header; struct mach_header_64; struct fat_header; int get_cpu_type_count(); const char *get_cpu_type_name(struct mach_header *header); const char *get_cpu_type_name_64(struct mach_header_64 *header); uint32_t get_magic(FILE *stream, const int offset); int is_magic_macho(const uint32_t magic); int is_magic_64(const uint32_t magic); int is_fat(const uint32_t magic); int is_should_swap_bytes(const uint32_t magic); struct fat_header *load_fat_header(FILE *stream, const int swap_bytes); uint32_t offset_for_arch(FILE *stream, const int index, const int swap_bytes); struct mach_header *load_mach_header(FILE *stream, const int offset, const int swap_bytes); struct mach_header_64 *load_mach_header_64(FILE *stream, const int offset, const int swap_bytes); struct segment_command *load_llvm_segment_command(FILE *stream, struct mach_header *header, const int offset, const int swap_bytes); struct segment_command_64 *load_llvm_segment_command_64(FILE *stream, struct 
mach_header_64 *header, const int offset, const int swap_bytes); #ifdef __cplusplus } #endif #endif ================================================ FILE: macho_retriever.c ================================================ #include "macho_reader.h" #include "macho_retriever.h" #include #include #include int max_number_of_archives() { return get_cpu_type_count(); } struct bitcode_archive *make_bitcode(FILE *stream, const char *cpuname, const uint64_t offset, const uint64_t size) { struct bitcode_archive *bitcode = malloc(sizeof(struct bitcode_archive)); bitcode->size = size; bitcode->buffer = malloc(sizeof(char) * size); fseek(stream, offset, SEEK_SET); fread(bitcode->buffer, sizeof(char), size, stream); bitcode->cpu = cpuname; return bitcode; } struct bitcode_archive *extract_bitcode(FILE *stream, const int offset, const int swap_bytes) { struct mach_header *header = load_mach_header(stream, offset, swap_bytes); const char *cpu_name = get_cpu_type_name(header); struct segment_command *segment = load_llvm_segment_command(stream, header, offset, swap_bytes); if (!segment) { free(header); return NULL; } struct bitcode_archive *bitcode = make_bitcode(stream, cpu_name, offset + segment->fileoff, segment->filesize); free(segment); free(header); return bitcode; } struct bitcode_archive *extract_bitcode_64(FILE *stream, const int offset, const int swap_bytes) { struct mach_header_64 *header = load_mach_header_64(stream, offset, swap_bytes); const char *cpu_name = get_cpu_type_name_64(header); struct segment_command_64 *segment = load_llvm_segment_command_64(stream, header, offset, swap_bytes); if (!segment) { free(header); return NULL; } struct bitcode_archive *bitcode = make_bitcode(stream, cpu_name, offset + segment->fileoff, segment->filesize); free(segment); free(header); return bitcode; } struct bitcode_archive *retrieve_bitcode_from_nonfat(FILE *stream, const uint32_t offset) { uint32_t magic = get_magic(stream, offset); int is64 = is_magic_64(magic); int swap_bytes = 
is_should_swap_bytes(magic); if (is64) { return extract_bitcode_64(stream, offset, swap_bytes); } else { return extract_bitcode(stream, offset, swap_bytes); } } int is_macho(FILE *stream) { uint32_t magic = get_magic(stream, 0); return is_magic_macho(magic); } void retrieve_bitcode(FILE *stream, struct bitcode_archive *bitcodes[], int *count) { uint32_t magic = get_magic(stream, 0); if (is_fat(magic)) { int swap_bytes = is_should_swap_bytes(magic); struct fat_header *header = load_fat_header(stream, swap_bytes); *count = header->nfat_arch; for (int i = 0; i < *count; i++) { uint32_t offset = offset_for_arch(stream, i, swap_bytes); bitcodes[i] = retrieve_bitcode_from_nonfat(stream, offset); } free(header); } else { bitcodes[0] = retrieve_bitcode_from_nonfat(stream, 0); *count = 1; } } ================================================ FILE: macho_retriever.h ================================================ #ifndef MACHO_RETRIEVER_H #define MACHO_RETRIEVER_H #ifdef __cplusplus extern "C" { #endif #include #include struct bitcode_archive { uint64_t size; char* buffer; const char* cpu; }; struct bitcode_archive* make_bitcode(FILE* stream, const char* cpuname, const uint64_t offset, const uint64_t size); struct bitcode_archive* extract_bitcode(FILE* stream, const int offset, const int swap_bytes); struct bitcode_archive* extract_bitcode_64(FILE* stream, const int offset, const int swap_bytes); struct bitcode_archive* retrieve_bitcode_from_nonfat(FILE* stream, const uint32_t offset); int max_number_of_archives(); int is_macho(FILE* stream); void retrieve_bitcode(FILE* stream, struct bitcode_archive* bitcodes[], int* count); #ifdef __cplusplus } #endif #endif ================================================ FILE: macho_util.c ================================================ #include "macho_reader.h" #include "macho_retriever.h" #include "macho_util.h" #include #include #include #include #include char *fname(const char *name, const char *ext) { const char *delimiter = "."; 
int length = strlen(name) + strlen(delimiter) + strlen(ext) + 1; char *filename = calloc(sizeof(char), length); strcpy(filename, name); strcat(filename, delimiter); strcat(filename, ext); return filename; } char *write_to_xar(struct bitcode_archive *bitcode) { char *filename = fname(bitcode->cpu, "xar"); FILE *output = fopen(filename, "wb"); if (!output) { fprintf(stderr, "Cannot open '%s' for writing\n", filename); free(filename); return NULL; } fwrite(bitcode->buffer, sizeof(char), bitcode->size, output); fclose(output); return filename; } int extract_xar(const char *path, const char *cpu, char *files[], int *count) { xar_t x; xar_iter_t xi; xar_file_t xf; xar_stream xs; char buffer[8192]; x = xar_open(path, READ); if (!x) { fprintf(stderr, "Error opening archive\n"); return 1; } xi = xar_iter_new(); if (!xi) { fprintf(stderr, "Error creating xar iterator\n"); return 2; } *count = 0; for (xf = xar_file_first(x, xi); xf; xf = xar_file_next(xi)) { char *path = xar_get_path(xf); const char *type; xar_prop_get(xf, "type", &type); if (!type) { fprintf(stderr, "File has no type %s\n", path); free(path); continue; } if (strcmp(type, "file") != 0) { fprintf(stderr, "Skipping %s\n", path); free(path); continue; } if (xar_extract_tostream_init(x, xf, &xs) != XAR_STREAM_OK) { fprintf(stderr, "Error initializing stream %s\n", path); free(path); continue; } char *prefix = fname(cpu, path); char *output_path = fname(prefix, "bc"); free(path); free(prefix); FILE *output = fopen(output_path, "wb"); if (!output) { fprintf(stderr, "Error opening output file %s\n", output_path); free(output_path); continue; } xs.avail_out = sizeof(buffer); xs.next_out = buffer; int32_t ret; while ((ret = xar_extract_tostream(&xs)) != XAR_STREAM_END) { if (ret == XAR_STREAM_ERR) { fprintf(stderr, "Error extracting stream %s\n", output_path); free(output_path); return 3; } fwrite(buffer, sizeof(char), sizeof(buffer) - xs.avail_out, output); xs.avail_out = sizeof(buffer); xs.next_out = buffer; } if 
(xar_extract_tostream_end(&xs) != XAR_STREAM_OK) { fprintf(stderr, "Error ending stream %s\n", output_path); } fclose(output); files[(*count)++] = output_path; } return 0; } int write_to_bitcode(struct bitcode_archive *bitcode, char *files[], int *count) { char *xar_file = write_to_xar(bitcode); int extracted = extract_xar(xar_file, bitcode->cpu, files, count); if (extracted != 0) { fprintf(stderr, "Error extracting xar file %s\n", xar_file); free(xar_file); return 1; } int removed = remove(xar_file); if (removed != 0) { fprintf(stderr, "Error removing xar file %s\n", xar_file); free(xar_file); return 2; } free(xar_file); return 0; } int get_options(xmlNode *option_parent, char *options[], int *size) { xmlNode *cur_node = NULL; *size = 0; for (cur_node = option_parent; cur_node; cur_node = cur_node->next) { if (cur_node->type == XML_ELEMENT_NODE && strcmp((const char *)cur_node->name, "option") == 0) { char *content = (char *)xmlNodeGetContent(cur_node); options[(*size)++] = content; } } return 0; } int get_linker_options(xmlNode *a_node, char *options[], int *size) { xmlNode *cur_node = NULL; for (cur_node = a_node; cur_node; cur_node = cur_node->next) { if (cur_node->type == XML_ELEMENT_NODE) { if (strcmp((const char *)cur_node->name, "link-options") == 0) { return get_options(cur_node->children, options, size); } else { get_linker_options(cur_node->children, options, size); } } } return 1; } int retrieve_toc(const char *xar_path, const char *toc_path) { xar_t x; x = xar_open(xar_path, READ); if (!x) { fprintf(stderr, "Error opening xar archive %s\n", xar_path); return 1; } xar_serialize(x, toc_path); return 0; } int retrieve_linker_options(const char *xar_path, char *options[], int *size) { const char *toc_file = "toc.temp"; retrieve_toc(xar_path, toc_file); xmlDoc *doc = NULL; doc = xmlReadFile(toc_file, NULL, 0); if (doc == NULL) { fprintf(stderr, "Cannot parse TOC %s\n", toc_file); remove(toc_file); return 1; } get_linker_options(xmlDocGetRootElement(doc), 
options, size); xmlFreeDoc(doc); xmlCleanupParser(); remove(toc_file); return 0; } ================================================ FILE: macho_util.h ================================================ #ifndef MACHO_UTIL_H #define MACHO_UTIL_H #ifdef __cplusplus extern "C" { #endif #include #include "macho_retriever.h" char *fname(const char *name, const char *ext); char *write_to_xar(struct bitcode_archive *bitcode); int extract_xar(const char *path, const char *cpu, char *files[], int *count); int write_to_bitcode(struct bitcode_archive *bitcode, char *files[], int *count); int get_options(xmlNode *option_parent, char *options[], int *size); int get_linker_options(xmlNode *a_node, char *options[], int *size); int retrieve_toc(const char *xar_path, const char *toc_path); int retrieve_linker_options(const char *xar_path, char *options[], int *size); #ifdef __cplusplus } #endif #endif ================================================ FILE: main.c ================================================ #include "macho_retriever.h" #include "macho_util.h" #include #include int main(int argc, char *argv[]) { int extract = 0; int linker_options = 0; int c; while ((c = getopt(argc, argv, "el")) != -1) { switch (c) { case 'e': extract = 1; break; case 'l': linker_options = 1; break; default: fprintf(stderr, "Unknown option `-%c'.\n", optopt); exit(1); } } if(optind >= argc) { fprintf(stderr, "No file provided.\n"); } char *filename = argv[optind]; FILE *stream = fopen(filename, "rb"); struct bitcode_archive *archives[max_number_of_archives()]; int archive_count; retrieve_bitcode(stream, archives, &archive_count); for (int i = 0; i < archive_count; i++) { if (archives[i]) { if (extract) { char *bitcode_files[1024]; int bitcode_count; write_to_bitcode(archives[i], bitcode_files, &bitcode_count); for (int j = 0; j < bitcode_count; j++) { printf("%s\n", bitcode_files[j]); free(bitcode_files[j]); } } if(!extract || linker_options) { char *xar_name = write_to_xar(archives[i]); 
printf("%s\n", xar_name); if (linker_options) { char *options[128]; int size = 0; retrieve_linker_options(xar_name, options, &size); printf("Linker options: "); for (int i = 0; i < size; i++) { printf("%s ", options[i]); free(options[i]); } printf("\n"); } free(xar_name); } free(archives[i]->buffer); free(archives[i]); } } fclose(stream); return 0; } ================================================ FILE: subject/main.c ================================================ #include extern int power2(int x); int main() { printf("%d\n", power2(4)); return 0; } ================================================ FILE: subject/power2.c ================================================ int power2(int x) { return x * x; }