Repository: matenure/FastGCN Branch: master Commit: b8e6e6412d8c Files: 48 Total size: 171.8 KB Directory structure: gitextract_xo97i8qd/ ├── README.md ├── __init__.py ├── create_Graph.py ├── create_Graph_forGraphSAGE.py ├── data/ │ ├── ind.citeseer.allx │ ├── ind.citeseer.ally │ ├── ind.citeseer.graph │ ├── ind.citeseer.test.index │ ├── ind.citeseer.tx │ ├── ind.citeseer.ty │ ├── ind.citeseer.x │ ├── ind.citeseer.y │ ├── ind.cora.allx │ ├── ind.cora.ally │ ├── ind.cora.graph │ ├── ind.cora.test.index │ ├── ind.cora.tx │ ├── ind.cora.ty │ ├── ind.cora.x │ ├── ind.cora.y │ ├── ind.pubmed.allx │ ├── ind.pubmed.ally │ ├── ind.pubmed.graph │ ├── ind.pubmed.test.index │ ├── ind.pubmed.tx │ ├── ind.pubmed.ty │ ├── ind.pubmed.x │ └── ind.pubmed.y ├── inits.py ├── lanczos.py ├── layers.py ├── metrics.py ├── models.py ├── pubmed-original_inductive_FastGCN.py ├── pubmed-original_transductive_FastGCN.py ├── pubmed_Mix.py ├── pubmed_Mix_sampleA.py ├── pubmed_Mix_uniform.py ├── pubmed_inductive_appr2layers.py ├── train.py ├── train_batch_multiRank_inductive_newscheme.py ├── train_batch_multiRank_inductive_reddit_Mixlayers_sampleA.py ├── train_batch_multiRank_inductive_reddit_Mixlayers_sampleBatch.py ├── train_batch_multiRank_inductive_reddit_Mixlayers_uniform.py ├── train_batch_multiRank_inductive_reddit_appr2layers.py ├── train_batch_multiRank_inductive_reddit_onelayer.py ├── transformRedditGraph2NPZ.py └── utils.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: README.md ================================================ # FastGCN This is the Tensorflow implementation of our ICLR2018 paper: ["**FastGCN: Fast Learning with Graph Convolutional Networks via Importance Sampling**".](https://openreview.net/forum?id=rytstxWAW&noteId=ByU9EpGSf) Instructions of the sample codes: [For Reddit dataset] train_batch_multiRank_inductive_reddit_Mixlayers_sampleA.py is the final
model. (precomputed the AH in the bottom layer) The original Reddit data should be transferred into the .npz format using this function: transferRedditDataFormat. Note: By default, this code does no sampling. To enable sampling, change `main(None)` at the bottom to `main(100)`. (The number is the sample size. You can also try other sample sizes) train_batch_multiRank_inductive_reddit_Mixlayers_uniform.py is the model for uniform sampling. train_batch_multiRank_inductive_reddit_appr2layers.py is the model for 2-layer approximation. create_Graph_forGraphSAGE.py is used to transfer the data into the GraphSAGE format, so that users can compare our method with GraphSAGE. We also include the transferred original Cora dataset in this repository (./data/cora_graphSAGE). [For pubmed or cora] train.py is the original GCN model. pubmed_Mix_sampleA.py: The dataset could be defined in the code, for example: flags.DEFINE_string('dataset', 'pubmed', 'Dataset string.') pubmed_Mix_uniform.py and pubmed_inductive_appr2layers.py are similar to the ones for reddit. pubmed-original**.py means the code is used for original Cora or Pubmed datasets. Users could also change their datasets by changing the data load function from load_data() to load_data_original().
================================================ FILE: __init__.py ================================================ from __future__ import print_function from __future__ import division ================================================ FILE: create_Graph.py ================================================ import numpy as np import pickle as pkl import scipy.sparse as sp import sys import os import networkx as nx from utils import * import json from networkx.readwrite import json_graph # 'cora', 'citeseer', 'pubmed' if __name__=="__main__": data_name = 'cora' adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(data_name) G = nx.from_scipy_sparse_matrix(adj) val_index = np.where(val_mask)[0] test_index = np.where(test_mask)[0] y = y_train+y_val+y_test y = np.argmax(y,axis=1) for i in range(len(y)): if i in val_index: G.node[i]['val']=True G.node[i]['test']=False elif i in test_index: G.node[i]['test']=True G.node[i]['val']=False else: G.node[i]['test'] = False G.node[i]['val'] = False data = json_graph.node_link_data(G) with open("cora/cora-G.json","wb") as f: json.dump(data,f) classMap = {} idMap = {} for i in range(len(y)): classMap[i]=y[i] idMap[i] = i with open("cora/cora-id_map.json","wb") as f: json.dump(idMap,f) with open("cora/cora-class_map.json","wb") as f: json.dump(classMap,f) np.save(open("cora/cora-feats.npy","wb"), features.todense()) ================================================ FILE: create_Graph_forGraphSAGE.py ================================================ import numpy as np import pickle as pkl import scipy.sparse as sp import sys import os import networkx as nx from utils import * import json from networkx.readwrite import json_graph # 'cora', 'citeseer', 'pubmed' if __name__=="__main__": data_name = 'cora' adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data_original(data_name) G = nx.from_scipy_sparse_matrix(adj) train_index = np.where(train_mask)[0] val_index = 
np.where(val_mask)[0] test_index = np.where(test_mask)[0] y = y_train+y_val+y_test y = np.argmax(y,axis=1) train_attr, val_attr, test_attr = ({i: bool(m) for i, m in enumerate(mask)} for mask in (train_mask, val_mask, test_mask)) nx.set_node_attributes(G, train_attr, 'train') nx.set_node_attributes(G, val_attr, 'val') nx.set_node_attributes(G, test_attr, 'test') data = json_graph.node_link_data(G) with open("%s/%s0-G.json" % (data_name, data_name), "wb") as f: json.dump(data,f) classMap = {} idMap = {} for i in range(len(y)): classMap[i]=y[i] idMap[i] = i with open("%s/%s0-id_map.json" % (data_name, data_name), "wb") as f: json.dump(idMap,f) with open("%s/%s0-class_map.json" % (data_name, data_name), "wb") as f: json.dump(classMap,f) np.save("%s/%s0-feats.npy" % (data_name, data_name), features.todense()) ================================================ FILE: data/ind.citeseer.test.index ================================================ 2488 2644 3261 2804 3176 2432 3310 2410 2812 2520 2994 3282 2680 2848 2670 3005 2977 2592 2967 2461 3184 2852 2768 2905 2851 3129 3164 2438 2793 2763 2528 2954 2347 2640 3265 2874 2446 2856 3149 2374 3097 3301 2664 2418 2655 2464 2596 3262 3278 2320 2612 2614 2550 2626 2772 3007 2733 2516 2476 2798 2561 2839 2685 2391 2705 3098 2754 3251 2767 2630 2727 2513 2701 3264 2792 2821 3260 2462 3307 2639 2900 3060 2672 3116 2731 3316 2386 2425 2518 3151 2586 2797 2479 3117 2580 3182 2459 2508 3052 3230 3215 2803 2969 2562 2398 3325 2343 3030 2414 2776 2383 3173 2850 2499 3312 2648 2784 2898 3056 2484 3179 3132 2577 2563 2867 3317 2355 3207 3178 2968 3319 2358 2764 3001 2683 3271 2321 2567 2502 3246 2715 3066 2390 2381 3162 2741 2498 2790 3038 3321 2481 3050 3161 3122 2801 2957 3177 2965 2621 3208 2921 2802 2357 2677 2519 2860 2696 2368 3241 2858 2419 2762 2875 3222 3064 2827 3044 2471 3062 2982 2736 2322 2709 2766 2424 2602 2970 2675 3299 2554 2964 2597 2753 2979 2523 2912 2896 2317 3167 2813 2482 2557 3043 3244 2985 2460 2363 3272 3045 3192 
2453 2656 2834 2443 3202 2926 2711 2633 2384 2752 3285 2817 2483 2919 2924 2661 2698 2361 2662 2819 3143 2316 3196 2739 2345 2578 2822 3229 2908 2917 2692 3200 2324 2522 3322 2697 3163 3093 3233 2774 2371 2835 2652 2539 2843 3231 2976 2429 2367 3144 2564 3283 3217 3035 2962 2433 2415 2387 3021 2595 2517 2468 3061 2673 2348 3027 2467 3318 2959 3273 2392 2779 2678 3004 2634 2974 3198 2342 2376 3249 2868 2952 2710 2838 2335 2524 2650 3186 2743 2545 2841 2515 2505 3181 2945 2738 2933 3303 2611 3090 2328 3010 3016 2504 2936 3266 3253 2840 3034 2581 2344 2452 2654 3199 3137 2514 2394 2544 2641 2613 2618 2558 2593 2532 2512 2975 3267 2566 2951 3300 2869 2629 2747 3055 2831 3105 3168 3100 2431 2828 2684 3269 2910 2865 2693 2884 3228 2783 3247 2770 3157 2421 2382 2331 3203 3240 2351 3114 2986 2688 2439 2996 3079 3103 3296 2349 2372 3096 2422 2551 3069 2737 3084 3304 3022 2542 3204 2949 2318 2450 3140 2734 2881 2576 3054 3089 3125 2761 3136 3111 2427 2466 3101 3104 3259 2534 2961 3191 3000 3036 2356 2800 3155 3224 2646 2735 3020 2866 2426 2448 3226 3219 2749 3183 2906 2360 2440 2946 2313 2859 2340 3008 2719 3058 2653 3023 2888 3243 2913 3242 3067 2409 3227 2380 2353 2686 2971 2847 2947 2857 3263 3218 2861 3323 2635 2966 2604 2456 2832 2694 3245 3119 2942 3153 2894 2555 3128 2703 2323 2631 2732 2699 2314 2590 3127 2891 2873 2814 2326 3026 3288 3095 2706 2457 2377 2620 2526 2674 3190 2923 3032 2334 3254 2991 3277 2973 2599 2658 2636 2826 3148 2958 3258 2990 3180 2538 2748 2625 2565 3011 3057 2354 3158 2622 3308 2983 2560 3169 3059 2480 3194 3291 3216 2643 3172 2352 2724 2485 2411 2948 2445 2362 2668 3275 3107 2496 2529 2700 2541 3028 2879 2660 3324 2755 2436 3048 2623 2920 3040 2568 3221 3003 3295 2473 3232 3213 2823 2897 2573 2645 3018 3326 2795 2915 3109 3086 2463 3118 2671 2909 2393 2325 3029 2972 3110 2870 3284 2816 2647 2667 2955 2333 2960 2864 2893 2458 2441 2359 2327 3256 3099 3073 3138 2511 2666 2548 2364 2451 2911 3237 3206 3080 3279 2934 2981 2878 3130 2830 3091 2659 
2449 3152 2413 2722 2796 3220 2751 2935 3238 2491 2730 2842 3223 2492 3074 3094 2833 2521 2883 3315 2845 2907 3083 2572 3092 2903 2918 3039 3286 2587 3068 2338 3166 3134 2455 2497 2992 2775 2681 2430 2932 2931 2434 3154 3046 2598 2366 3015 3147 2944 2582 3274 2987 2642 2547 2420 2930 2750 2417 2808 3141 2997 2995 2584 2312 3033 3070 3065 2509 3314 2396 2543 2423 3170 2389 3289 2728 2540 2437 2486 2895 3017 2853 2406 2346 2877 2472 3210 2637 2927 2789 2330 3088 3102 2616 3081 2902 3205 3320 3165 2984 3185 2707 3255 2583 2773 2742 3024 2402 2718 2882 2575 3281 2786 2855 3014 2401 2535 2687 2495 3113 2609 2559 2665 2530 3293 2399 2605 2690 3133 2799 2533 2695 2713 2886 2691 2549 3077 3002 3049 3051 3087 2444 3085 3135 2702 3211 3108 2501 2769 3290 2465 3025 3019 2385 2940 2657 2610 2525 2941 3078 2341 2916 2956 2375 2880 3009 2780 2370 2925 2332 3146 2315 2809 3145 3106 2782 2760 2493 2765 2556 2890 2400 2339 3201 2818 3248 3280 2570 2569 2937 3174 2836 2708 2820 3195 2617 3197 2319 2744 2615 2825 2603 2914 2531 3193 2624 2365 2810 3239 3159 2537 2844 2758 2938 3037 2503 3297 2885 2608 2494 2712 2408 2901 2704 2536 2373 2478 2723 3076 2627 2369 2669 3006 2628 2788 3276 2435 3139 3235 2527 2571 2815 2442 2892 2978 2746 3150 2574 2725 3188 2601 2378 3075 2632 2794 3270 3071 2506 3126 3236 3257 2824 2989 2950 2428 2405 3156 2447 2787 2805 2720 2403 2811 2329 2474 2785 2350 2507 2416 3112 2475 2876 2585 2487 3072 3082 2943 2757 2388 2600 3294 2756 3142 3041 2594 2998 3047 2379 2980 2454 2862 3175 2588 3031 3012 2889 2500 2791 2854 2619 2395 2807 2740 2412 3131 3013 2939 2651 2490 2988 2863 3225 2745 2714 3160 3124 2849 2676 2872 3287 3189 2716 3115 2928 2871 2591 2717 2546 2777 3298 2397 3187 2726 2336 3268 2477 2904 2846 3121 2899 2510 2806 2963 3313 2679 3302 2663 3053 2469 2999 3311 2470 2638 3120 3171 2689 2922 2607 2721 2993 2887 2837 2929 2829 3234 2649 2337 2759 2778 2771 2404 2589 3123 3209 2729 3252 2606 2579 2552 ================================================ 
FILE: data/ind.cora.test.index ================================================ 2692 2532 2050 1715 2362 2609 2622 1975 2081 1767 2263 1725 2588 2259 2357 1998 2574 2179 2291 2382 1812 1751 2422 1937 2631 2510 2378 2589 2345 1943 1850 2298 1825 2035 2507 2313 1906 1797 2023 2159 2495 1886 2122 2369 2461 1925 2565 1858 2234 2000 1846 2318 1723 2559 2258 1763 1991 1922 2003 2662 2250 2064 2529 1888 2499 2454 2320 2287 2203 2018 2002 2632 2554 2314 2537 1760 2088 2086 2218 2605 1953 2403 1920 2015 2335 2535 1837 2009 1905 2636 1942 2193 2576 2373 1873 2463 2509 1954 2656 2455 2494 2295 2114 2561 2176 2275 2635 2442 2704 2127 2085 2214 2487 1739 2543 1783 2485 2262 2472 2326 1738 2170 2100 2384 2152 2647 2693 2376 1775 1726 2476 2195 1773 1793 2194 2581 1854 2524 1945 1781 1987 2599 1744 2225 2300 1928 2042 2202 1958 1816 1916 2679 2190 1733 2034 2643 2177 1883 1917 1996 2491 2268 2231 2471 1919 1909 2012 2522 1865 2466 2469 2087 2584 2563 1924 2143 1736 1966 2533 2490 2630 1973 2568 1978 2664 2633 2312 2178 1754 2307 2480 1960 1742 1962 2160 2070 2553 2433 1768 2659 2379 2271 1776 2153 1877 2027 2028 2155 2196 2483 2026 2158 2407 1821 2131 2676 2277 2489 2424 1963 1808 1859 2597 2548 2368 1817 2405 2413 2603 2350 2118 2329 1969 2577 2475 2467 2425 1769 2092 2044 2586 2608 1983 2109 2649 1964 2144 1902 2411 2508 2360 1721 2005 2014 2308 2646 1949 1830 2212 2596 1832 1735 1866 2695 1941 2546 2498 2686 2665 1784 2613 1970 2021 2211 2516 2185 2479 2699 2150 1990 2063 2075 1979 2094 1787 2571 2690 1926 2341 2566 1957 1709 1955 2570 2387 1811 2025 2447 2696 2052 2366 1857 2273 2245 2672 2133 2421 1929 2125 2319 2641 2167 2418 1765 1761 1828 2188 1972 1997 2419 2289 2296 2587 2051 2440 2053 2191 1923 2164 1861 2339 2333 2523 2670 2121 1921 1724 2253 2374 1940 2545 2301 2244 2156 1849 2551 2011 2279 2572 1757 2400 2569 2072 2526 2173 2069 2036 1819 1734 1880 2137 2408 2226 2604 1771 2698 2187 2060 1756 2201 2066 2439 1844 1772 2383 2398 1708 1992 1959 1794 2426 2702 2444 1944 
1829 2660 2497 2607 2343 1730 2624 1790 1935 1967 2401 2255 2355 2348 1931 2183 2161 2701 1948 2501 2192 2404 2209 2331 1810 2363 2334 1887 2393 2557 1719 1732 1986 2037 2056 1867 2126 1932 2117 1807 1801 1743 2041 1843 2388 2221 1833 2677 1778 2661 2306 2394 2106 2430 2371 2606 2353 2269 2317 2645 2372 2550 2043 1968 2165 2310 1985 2446 1982 2377 2207 1818 1913 1766 1722 1894 2020 1881 2621 2409 2261 2458 2096 1712 2594 2293 2048 2359 1839 2392 2254 1911 2101 2367 1889 1753 2555 2246 2264 2010 2336 2651 2017 2140 1842 2019 1890 2525 2134 2492 2652 2040 2145 2575 2166 1999 2434 1711 2276 2450 2389 2669 2595 1814 2039 2502 1896 2168 2344 2637 2031 1977 2380 1936 2047 2460 2102 1745 2650 2046 2514 1980 2352 2113 1713 2058 2558 1718 1864 1876 2338 1879 1891 2186 2451 2181 2638 2644 2103 2591 2266 2468 1869 2582 2674 2361 2462 1748 2215 2615 2236 2248 2493 2342 2449 2274 1824 1852 1870 2441 2356 1835 2694 2602 2685 1893 2544 2536 1994 1853 1838 1786 1930 2539 1892 2265 2618 2486 2583 2061 1796 1806 2084 1933 2095 2136 2078 1884 2438 2286 2138 1750 2184 1799 2278 2410 2642 2435 1956 2399 1774 2129 1898 1823 1938 2299 1862 2420 2673 1984 2204 1717 2074 2213 2436 2297 2592 2667 2703 2511 1779 1782 2625 2365 2315 2381 1788 1714 2302 1927 2325 2506 2169 2328 2629 2128 2655 2282 2073 2395 2247 2521 2260 1868 1988 2324 2705 2541 1731 2681 2707 2465 1785 2149 2045 2505 2611 2217 2180 1904 2453 2484 1871 2309 2349 2482 2004 1965 2406 2162 1805 2654 2007 1947 1981 2112 2141 1720 1758 2080 2330 2030 2432 2089 2547 1820 1815 2675 1840 2658 2370 2251 1908 2029 2068 2513 2549 2267 2580 2327 2351 2111 2022 2321 2614 2252 2104 1822 2552 2243 1798 2396 2663 2564 2148 2562 2684 2001 2151 2706 2240 2474 2303 2634 2680 2055 2090 2503 2347 2402 2238 1950 2054 2016 1872 2233 1710 2032 2540 2628 1795 2616 1903 2531 2567 1946 1897 2222 2227 2627 1856 2464 2241 2481 2130 2311 2083 2223 2284 2235 2097 1752 2515 2527 2385 2189 2283 2182 2079 2375 2174 2437 1993 2517 2443 2224 2648 2171 2290 2542 
2038 1855 1831 1759 1848 2445 1827 2429 2205 2598 2657 1728 2065 1918 2427 2573 2620 2292 1777 2008 1875 2288 2256 2033 2470 2585 2610 2082 2230 1915 1847 2337 2512 2386 2006 2653 2346 1951 2110 2639 2520 1939 2683 2139 2220 1910 2237 1900 1836 2197 1716 1860 2077 2519 2538 2323 1914 1971 1845 2132 1802 1907 2640 2496 2281 2198 2416 2285 1755 2431 2071 2249 2123 1727 2459 2304 2199 1791 1809 1780 2210 2417 1874 1878 2116 1961 1863 2579 2477 2228 2332 2578 2457 2024 1934 2316 1841 1764 1737 2322 2239 2294 1729 2488 1974 2473 2098 2612 1834 2340 2423 2175 2280 2617 2208 2560 1741 2600 2059 1747 2242 2700 2232 2057 2147 2682 1792 1826 2120 1895 2364 2163 1851 2391 2414 2452 1803 1989 2623 2200 2528 2415 1804 2146 2619 2687 1762 2172 2270 2678 2593 2448 1882 2257 2500 1899 2478 2412 2107 1746 2428 2115 1800 1901 2397 2530 1912 2108 2206 2091 1740 2219 1976 2099 2142 2671 2668 2216 2272 2229 2666 2456 2534 2697 2688 2062 2691 2689 2154 2590 2626 2390 1813 2067 1952 2518 2358 1789 2076 2049 2119 2013 2124 2556 2105 2093 1885 2305 2354 2135 2601 1770 1995 2504 1749 2157 ================================================ FILE: data/ind.pubmed.test.index ================================================ 18747 19392 19181 18843 19221 18962 19560 19097 18966 19014 18756 19313 19000 19569 19359 18854 18970 19073 19661 19180 19377 18750 19401 18788 19224 19447 19017 19241 18890 18908 18965 19001 18849 19641 18852 19222 19172 18762 19156 19162 18856 18763 19318 18826 19712 19192 19695 19030 19523 19249 19079 19232 19455 18743 18800 19071 18885 19593 19394 19390 18832 19445 18838 19632 19548 19546 18825 19498 19266 19117 19595 19252 18730 18913 18809 19452 19520 19274 19555 19388 18919 19099 19637 19403 18720 19526 18905 19451 19408 18923 18794 19322 19431 18912 18841 19239 19125 19258 19565 18898 19482 19029 18778 19096 19684 19552 18765 19361 19171 19367 19623 19402 19327 19118 18888 18726 19510 18831 19490 19576 19050 18729 18896 19246 19012 18862 18873 19193 19693 19474 18953 
19115 19182 19269 19116 18837 18872 19007 19212 18798 19102 18772 19660 19511 18914 18886 19672 19360 19213 18810 19420 19512 18719 19432 19350 19127 18782 19587 18924 19488 18781 19340 19190 19383 19094 18835 19487 19230 18791 18882 18937 18928 18755 18802 19516 18795 18786 19273 19349 19398 19626 19130 19351 19489 19446 18959 19025 18792 18878 19304 19629 19061 18785 19194 19179 19210 19417 19583 19415 19443 18739 19662 18904 18910 18901 18960 18722 18827 19290 18842 19389 19344 18961 19098 19147 19334 19358 18829 18984 18931 18742 19320 19111 19196 18887 18991 19469 18990 18876 19261 19270 19522 19088 19284 19646 19493 19225 19615 19449 19043 19674 19391 18918 19155 19110 18815 19131 18834 19715 19603 19688 19133 19053 19166 19066 18893 18757 19582 19282 19257 18869 19467 18954 19371 19151 19462 19598 19653 19187 19624 19564 19534 19581 19478 18985 18746 19342 18777 19696 18824 19138 18728 19643 19199 18731 19168 18948 19216 19697 19347 18808 18725 19134 18847 18828 18996 19106 19485 18917 18911 18776 19203 19158 18895 19165 19382 18780 18836 19373 19659 18947 19375 19299 18761 19366 18754 19248 19416 19658 19638 19034 19281 18844 18922 19491 19272 19341 19068 19332 19559 19293 18804 18933 18935 19405 18936 18945 18943 18818 18797 19570 19464 19428 19093 19433 18986 19161 19255 19157 19046 19292 19434 19298 18724 19410 19694 19214 19640 19189 18963 19218 19585 19041 19550 19123 19620 19376 19561 18944 19706 19056 19283 18741 19319 19144 19542 18821 19404 19080 19303 18793 19306 19678 19435 19519 19566 19278 18946 19536 19020 19057 19198 19333 19649 19699 19399 19654 19136 19465 19321 19577 18907 19665 19386 19596 19247 19473 19568 19355 18925 19586 18982 19616 19495 19612 19023 19438 18817 19692 19295 19414 19676 19472 19107 19062 19035 18883 19409 19052 19606 19091 19651 19475 19413 18796 19369 19639 19701 19461 19645 19251 19063 19679 19545 19081 19363 18995 19549 18790 18855 18833 18899 19395 18717 19647 18768 19103 19245 18819 18779 19656 19076 18745 18971 
19197 19711 19074 19128 19466 19139 19309 19324 18814 19092 19627 19060 18806 18929 18737 18942 18906 18858 19456 19253 19716 19104 19667 19574 18903 19237 18864 19556 19364 18952 19008 19323 19700 19170 19267 19345 19238 18909 18892 19109 19704 18902 19275 19680 18723 19242 19112 19169 18956 19343 19650 19541 19698 19521 19087 18976 19038 18775 18968 19671 19412 19407 19573 19027 18813 19357 19460 19673 19481 19036 19614 18787 19195 18732 18884 19613 19657 19575 19226 19589 19234 19617 19707 19484 18740 19424 18784 19419 19159 18865 19105 19315 19480 19664 19378 18803 19605 18870 19042 19426 18848 19223 19509 19532 18752 19691 18718 19209 19362 19090 19492 19567 19687 19018 18830 19530 19554 19119 19442 19558 19527 19427 19291 19543 19422 19142 18897 18950 19425 19002 19588 18978 19551 18930 18736 19101 19215 19150 19263 18949 18974 18759 19335 19200 19129 19328 19437 18988 19429 19368 19406 19049 18811 19296 19256 19385 19602 18770 19337 19580 19476 19045 19132 19089 19120 19265 19483 18767 19227 18934 19069 18820 19006 19459 18927 19037 19280 19441 18823 19015 19114 19618 18957 19176 18853 19648 19201 19444 19279 18751 19302 19505 18733 19601 19533 18863 19708 19387 19346 19152 19206 18851 19338 19681 19380 19055 18766 19085 19591 19547 18958 19146 18840 19051 19021 19207 19235 19086 18979 19300 18939 19100 19619 19287 18980 19277 19326 19108 18920 19625 19374 19078 18734 19634 19339 18877 19423 19652 19683 19044 18983 19330 19529 19714 19468 19075 19540 18839 19022 19286 19537 19175 19463 19167 19705 19562 19244 19486 19611 18801 19178 19590 18846 19450 19205 19381 18941 19670 19185 19504 19633 18997 19113 19397 19636 19709 19289 19264 19353 19584 19126 18938 19669 18964 19276 18774 19173 19231 18973 18769 19064 19040 19668 18738 19082 19655 19236 19352 19609 19628 18951 19384 19122 18875 18992 18753 19379 19254 19301 19506 19135 19010 19682 19400 19579 19316 19553 19208 19635 19644 18891 19024 18989 19250 18850 19317 18915 19607 18799 18881 19479 19031 19365 
19164 18744 18760 19502 19058 19517 18735 19448 19243 19453 19285 18857 19439 19016 18975 19503 18998 18981 19186 18994 19240 19631 19070 19174 18900 19065 19220 19229 18880 19308 19372 19496 18771 19325 19538 19033 18874 19077 19211 18764 19458 19571 19121 19019 19059 19497 18969 19666 19297 19219 19622 19184 18977 19702 19539 19329 19095 19675 18972 19514 19703 19188 18866 18812 19314 18822 18845 19494 19411 18916 19686 18967 19294 19143 19204 18805 19689 19233 18758 18748 19011 19685 19336 19608 19454 19124 18868 18807 19544 19621 19228 19154 19141 19145 19153 18860 19163 19393 19268 19160 19305 19259 19471 19524 18783 19396 18894 19430 19690 19348 19597 19592 19677 18889 19331 18773 19137 19009 18932 19599 18816 19054 19067 19477 19191 18921 18940 19578 19183 19004 19072 19710 19005 19610 18955 19457 19148 18859 18993 19642 19047 19418 19535 19600 19312 19039 19028 18879 19003 19026 19013 19149 19177 19217 18987 19354 19525 19202 19084 19032 18749 18867 19048 18999 19260 19630 18727 19356 19083 18926 18789 19370 18861 19311 19557 19531 19436 19140 19310 19501 18721 19604 19713 19262 19563 19507 19440 19572 19513 19515 19518 19421 19470 19499 19663 19508 18871 19528 19500 19307 19288 19594 19271 ================================================ FILE: inits.py ================================================ import tensorflow as tf import numpy as np def uniform(shape, scale=0.05, name=None): """Uniform init.""" initial = tf.random_uniform(shape, minval=-scale, maxval=scale, dtype=tf.float32) return tf.Variable(initial, name=name) def glorot(shape, name=None): """Glorot & Bengio (AISTATS 2010) init.""" init_range = np.sqrt(6.0/(shape[0]+shape[1])) initial = tf.random_uniform(shape, minval=-init_range, maxval=init_range, dtype=tf.float32) return tf.Variable(initial, name=name) def zeros(shape, name=None): """All zeros.""" initial = tf.zeros(shape, dtype=tf.float32) return tf.Variable(initial, name=name) def ones(shape, name=None): """All ones.""" initial = 
tf.ones(shape, dtype=tf.float32) return tf.Variable(initial, name=name) ================================================ FILE: lanczos.py ================================================ import numpy as np from numpy.linalg import norm from utils import load_data as dataload import scipy.sparse as sparse import pickle from scipy.linalg import qr, svd def lanczos(A,k,q): n = A.shape[0] Q = np.zeros((n,k+1)) Q[:,0] = q/norm(q) alpha = 0 beta = 0 for i in range(k): if i == 0: q = np.dot(A,Q[:,i]) else: q = np.dot(A, Q[:,i]) - beta*Q[:,i-1] alpha = np.dot(q.T, Q[:,i]) q = q - Q[:,i]*alpha q = q - np.dot(Q[:,:i], np.dot(Q[:,:i].T, q)) # full reorthogonalization beta = norm(q) Q[:,i+1] = q/beta print(i) Q = Q[:,:k] Sigma = np.dot(Q.T, np.dot(A, Q)) # A2 = np.dot(Q[:,:k], np.dot(Sigma[:k,:k], Q[:,:k].T)) # return A2 return Q, Sigma def dense_RandomSVD(A,K): G = np.random.randn(A.shape[0],K) B = np.dot(A,G) Q,R =qr(B,mode='economic') M = np.dot(np.dot(Q, np.dot(np.dot(Q.T, A),Q)),Q.T) return M if __name__=="__main__": adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = dataload('cora') print(adj.shape) adj = np.array(sparse.csr_matrix.todense(adj)) # np.save("ADJ_cora.npy",adj) q = np.random.randn(adj.shape[0],) Q, sigma = lanczos(adj,100,q) r = 100 A2 = np.dot(Q[:,:r], np.dot(sigma[:r,:r], Q[:,:r].T)) # u,v,a = svd(adj) err = norm(adj-A2)/norm(adj) print(err) # A = np.random.random((10000,10000)) # A = np.triu(A) + np.triu(A).T # q = np.random.random((10000,)) # K = 100 # Q, sigma = lanczos(A,K,q) # r = 100 # A2 = np.dot(Q[:,:r], np.dot(sigma[:r,:r], Q[:,:r].T)) # err = norm(A-A2)/norm(A) # print(err) ================================================ FILE: layers.py ================================================ from inits import * import tensorflow as tf flags = tf.app.flags FLAGS = flags.FLAGS # global unique layer ID dictionary for layer name assignment _LAYER_UIDS = {} def get_layer_uid(layer_name=''): """Helper function, assigns unique layer 
IDs.""" if layer_name not in _LAYER_UIDS: _LAYER_UIDS[layer_name] = 1 return 1 else: _LAYER_UIDS[layer_name] += 1 return _LAYER_UIDS[layer_name] def sparse_dropout(x, keep_prob, noise_shape): """Dropout for sparse tensors.""" random_tensor = keep_prob random_tensor += tf.random_uniform(noise_shape) dropout_mask = tf.cast(tf.floor(random_tensor), dtype=tf.bool) pre_out = tf.sparse_retain(x, dropout_mask) return pre_out * (1./keep_prob) def dot(x, y, sparse=False): """Wrapper for tf.matmul (sparse vs dense).""" if sparse: res = tf.sparse_tensor_dense_matmul(x, y) else: res = tf.matmul(x, y) return res class Layer(object): """Base layer class. Defines basic API for all layer objects. Implementation inspired by keras (http://keras.io). # Properties name: String, defines the variable scope of the layer. logging: Boolean, switches Tensorflow histogram logging on/off # Methods _call(inputs): Defines computation graph of layer (i.e. takes input, returns output) __call__(inputs): Wrapper for _call() _log_vars(): Log all variables """ def __init__(self, **kwargs): allowed_kwargs = {'name', 'logging'} for kwarg in kwargs.keys(): assert kwarg in allowed_kwargs, 'Invalid keyword argument: ' + kwarg name = kwargs.get('name') if not name: layer = self.__class__.__name__.lower() name = layer + '_' + str(get_layer_uid(layer)) self.name = name self.vars = {} logging = kwargs.get('logging', False) self.logging = logging self.sparse_inputs = False def _call(self, inputs): return inputs def __call__(self, inputs): with tf.name_scope(self.name): if self.logging and not self.sparse_inputs: tf.summary.histogram(self.name + '/inputs', inputs) outputs = self._call(inputs) if self.logging: tf.summary.histogram(self.name + '/outputs', outputs) return outputs def _log_vars(self): for var in self.vars: tf.summary.histogram(self.name + '/vars/' + var, self.vars[var]) class Dense(Layer): """Dense layer.""" def __init__(self, input_dim, output_dim, placeholders, dropout=0., sparse_inputs=False, 
act=tf.nn.relu, bias=False, featureless=False, **kwargs): super(Dense, self).__init__(**kwargs) if dropout: self.dropout = placeholders['dropout'] else: self.dropout = 0. self.act = act self.sparse_inputs = sparse_inputs self.featureless = featureless self.bias = bias # helper variable for sparse dropout self.num_features_nonzero = placeholders['num_features_nonzero'] with tf.variable_scope(self.name + '_vars'): self.vars['weights'] = glorot([input_dim, output_dim], name='weights') if self.bias: self.vars['bias'] = zeros([output_dim], name='bias') if self.logging: self._log_vars() def _call(self, inputs): x = inputs # dropout if self.sparse_inputs: x = sparse_dropout(x, 1-self.dropout, self.num_features_nonzero) else: x = tf.nn.dropout(x, 1-self.dropout) # transform output = dot(x, self.vars['weights'], sparse=self.sparse_inputs) # bias if self.bias: output += self.vars['bias'] return self.act(output) class GraphConvolution(Layer): """Graph convolution layer.""" def __init__(self, input_dim, output_dim, placeholders, dropout=0., support=None, sparse_inputs=False, act=tf.nn.relu, bias=False, featureless=False, **kwargs): super(GraphConvolution, self).__init__(**kwargs) if dropout: self.dropout = placeholders['dropout'] else: self.dropout = 0. 
self.act = act if support is None: self.support = placeholders['support'][0] else: self.support = support self.sparse_inputs = sparse_inputs self.featureless = featureless self.bias = bias # helper variable for sparse dropout self.num_features_nonzero = placeholders['num_features_nonzero'] with tf.variable_scope(self.name + '_vars'): for i in range(1): self.vars['weights_' + str(i)] = glorot([input_dim, output_dim], name='weights_' + str(i)) if self.bias: self.vars['bias'] = zeros([output_dim], name='bias') if self.logging: self._log_vars() def _call(self, inputs): x = inputs # dropout if self.sparse_inputs: x = sparse_dropout(x, 1-self.dropout, self.num_features_nonzero) else: x = tf.nn.dropout(x, 1-self.dropout) # convolve # supports = list() # for i in range(len(self.support)): # if not self.featureless: # pre_sup = dot(x, self.vars['weights_' + str(i)], # sparse=self.sparse_inputs) # else: # pre_sup = self.vars['weights_' + str(i)] # support = dot(self.support[i], pre_sup, sparse=True) # supports.append(support) # output = tf.add_n(supports) if not self.featureless: pre_sup = dot(x, self.vars['weights_0'], sparse=self.sparse_inputs) else: pre_sup = self.vars['weights_0'] output = dot(self.support, pre_sup, sparse=True) # bias if self.bias: output += self.vars['bias'] return self.act(output) class SampledGraphConvolution(Layer): """Graph convolution layer.""" def __init__(self, input_dim, output_dim, placeholders, dropout=0., rank = 100, support=None, sparse_inputs=False, act=tf.nn.relu, bias=False, featureless=False, **kwargs): super(SampledGraphConvolution, self).__init__(**kwargs) if dropout: self.dropout = placeholders['dropout'] else: self.dropout = 0. 
self.act = act if support is None: self.support = placeholders['support'][0] else: self.support = support self.sparse_inputs = sparse_inputs self.featureless = featureless self.bias = bias # helper variable for sparse dropout self.num_features_nonzero = placeholders['num_features_nonzero'] self.rank = rank with tf.variable_scope(self.name + '_vars'): for i in range(1): self.vars['weights_' + str(i)] = glorot([input_dim, output_dim], name='weights_' + str(i)) if self.bias: self.vars['bias'] = zeros([output_dim], name='bias') if self.logging: self._log_vars() def _call(self, inputs): x = inputs norm_x = tf.nn.l2_normalize(x, axis=1) norm_support = tf.nn.l2_normalize(self.support, axis=0) norm_mix = tf.cross(norm_x, norm_support) norm_mix = norm_mix*tf.inv(tf.reduce_sum(norm_mix)) sampledIndex = tf.multinomial(tf.log(norm_mix), self.rank) new_support = dot(self.support,tf.diag(norm_mix),sparse=True) # dropout if self.sparse_inputs: x = sparse_dropout(x, 1-self.dropout, self.num_features_nonzero) else: x = tf.nn.dropout(x, 1-self.dropout) # convolve # supports = list() # for i in range(len(self.support)): # if not self.featureless: # pre_sup = dot(x, self.vars['weights_' + str(i)], # sparse=self.sparse_inputs) # else: # pre_sup = self.vars['weights_' + str(i)] # support = dot(self.support[i], pre_sup, sparse=True) # supports.append(support) # output = tf.add_n(supports) if not self.featureless: pre_sup = dot(x, self.vars['weights_0'], sparse=self.sparse_inputs) else: pre_sup = self.vars['weights_0'] output = dot(new_support, pre_sup, sparse=True) # bias if self.bias: output += self.vars['bias'] return self.act(output) ================================================ FILE: metrics.py ================================================ import tensorflow as tf def masked_softmax_cross_entropy(preds, labels, mask): """Softmax cross-entropy loss with masking.""" loss = tf.nn.softmax_cross_entropy_with_logits(logits=preds, labels=labels) mask = tf.cast(mask, dtype=tf.float32) 
    # Rescale the mask so the mean loss over masked entries is unbiased
    # (continuation of masked_softmax_cross_entropy).
    mask /= tf.reduce_mean(mask)
    loss *= mask
    return tf.reduce_mean(loss)


def masked_accuracy(preds, labels, mask):
    """Accuracy with masking."""
    correct_prediction = tf.equal(tf.argmax(preds, 1), tf.argmax(labels, 1))
    accuracy_all = tf.cast(correct_prediction, tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)
    # same unbiased rescaling as in masked_softmax_cross_entropy
    mask /= tf.reduce_mean(mask)
    accuracy_all *= mask
    return tf.reduce_mean(accuracy_all)


def softmax_cross_entropy(preds, labels):
    """Mean softmax cross-entropy over all rows (unmasked variant)."""
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=preds, labels=labels)
    return tf.reduce_mean(loss)


def accuracy(preds, labels):
    """Mean argmax accuracy over all rows (unmasked variant)."""
    correct_prediction = tf.equal(tf.argmax(preds, 1), tf.argmax(labels, 1))
    accuracy_all = tf.cast(correct_prediction, tf.float32)
    return tf.reduce_mean(accuracy_all)


================================================ FILE: models.py ================================================
from layers import *
from metrics import *

flags = tf.app.flags
FLAGS = flags.FLAGS


class Model(object):
    """Base class: subclasses define _build/_loss/_accuracy; build() wires them up."""

    def __init__(self, **kwargs):
        allowed_kwargs = {'name', 'logging'}
        for kwarg in kwargs.keys():
            assert kwarg in allowed_kwargs, 'Invalid keyword argument: ' + kwarg
        name = kwargs.get('name')
        if not name:
            # default the variable-scope name to the lowercased class name
            name = self.__class__.__name__.lower()
        self.name = name

        logging = kwargs.get('logging', False)
        self.logging = logging

        self.vars = {}
        self.placeholders = {}

        self.layers = []        # layer objects, in forward order
        self.activations = []   # per-layer outputs, filled by build()

        self.inputs = None
        self.outputs = None

        self.loss = 0
        self.accuracy = 0
        self.optimizer = None
        self.opt_op = None

    def _build(self):
        raise NotImplementedError

    def build(self):
        """ Wrapper for _build() """
        with tf.variable_scope(self.name):
            self._build()

        # Build sequential layer model
        self.activations.append(self.inputs)
        for layer in self.layers:
            hidden = layer(self.activations[-1])
            self.activations.append(hidden)
        self.outputs = self.activations[-1]

        # Store model variables for easy access
        variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name)
        self.vars = {var.name: var for var in variables}

        # Build metrics
        self._loss()
        # (continuation of Model.build)
        self._accuracy()

        self.opt_op = self.optimizer.minimize(self.loss)

    def predict(self):
        # no-op in the base class; subclasses return softmax probabilities
        pass

    def _loss(self):
        raise NotImplementedError

    def _accuracy(self):
        raise NotImplementedError

    def save(self, sess=None):
        """Checkpoint this model's variables to tmp/<name>.ckpt."""
        if not sess:
            raise AttributeError("TensorFlow session not provided.")
        saver = tf.train.Saver(self.vars)
        save_path = saver.save(sess, "tmp/%s.ckpt" % self.name)
        print("Model saved in file: %s" % save_path)

    def load(self, sess=None):
        """Restore this model's variables from tmp/<name>.ckpt."""
        if not sess:
            raise AttributeError("TensorFlow session not provided.")
        saver = tf.train.Saver(self.vars)
        save_path = "tmp/%s.ckpt" % self.name
        saver.restore(sess, save_path)
        print("Model restored from file: %s" % save_path)


class MLP(Model):
    """Two Dense layers; masked loss/accuracy over the 'labels_mask' placeholder."""

    def __init__(self, placeholders, input_dim, **kwargs):
        super(MLP, self).__init__(**kwargs)

        self.inputs = placeholders['features']
        self.input_dim = input_dim
        # self.input_dim = self.inputs.get_shape().as_list()[1]  # To be supported in future Tensorflow versions
        self.output_dim = placeholders['labels'].get_shape().as_list()[1]
        self.placeholders = placeholders

        self.optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate)

        self.build()

    def _loss(self):
        # Weight decay loss — applied to the first layer's weights only
        for var in self.layers[0].vars.values():
            self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var)

        # Cross entropy error
        self.loss += masked_softmax_cross_entropy(self.outputs, self.placeholders['labels'],
                                                  self.placeholders['labels_mask'])

    def _accuracy(self):
        self.accuracy = masked_accuracy(self.outputs, self.placeholders['labels'],
                                        self.placeholders['labels_mask'])

    def _build(self):
        self.layers.append(Dense(input_dim=self.input_dim,
                                 output_dim=FLAGS.hidden1,
                                 placeholders=self.placeholders,
                                 act=tf.nn.relu,
                                 dropout=True,
                                 sparse_inputs=True,
                                 logging=self.logging))

        self.layers.append(Dense(input_dim=FLAGS.hidden1,
                                 output_dim=self.output_dim,
                                 placeholders=self.placeholders,
                                 act=lambda x: x,
                                 dropout=True,
                                 logging=self.logging))

    def predict(self):
        return tf.nn.softmax(self.outputs)


class GCN(Model):
    """Standard two-layer GCN (Kipf-style), masked loss/accuracy."""

    def __init__(self, placeholders, input_dim, **kwargs):
        super(GCN, self).__init__(**kwargs)

        self.inputs = placeholders['features']
        self.input_dim = input_dim
        # self.input_dim = self.inputs.get_shape().as_list()[1]  # To be supported in future Tensorflow versions
        self.output_dim = placeholders['labels'].get_shape().as_list()[1]
        self.placeholders = placeholders

        self.optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate)

        self.build()

    def _loss(self):
        # Weight decay loss — first layer only
        for var in self.layers[0].vars.values():
            self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var)

        # Cross entropy error
        self.loss += masked_softmax_cross_entropy(self.outputs, self.placeholders['labels'],
                                                  self.placeholders['labels_mask'])

    def _accuracy(self):
        self.accuracy = masked_accuracy(self.outputs, self.placeholders['labels'],
                                        self.placeholders['labels_mask'])

    def _build(self):
        self.layers.append(GraphConvolution(input_dim=self.input_dim,
                                            output_dim=FLAGS.hidden1,
                                            placeholders=self.placeholders,
                                            act=tf.nn.relu,
                                            dropout=True,
                                            sparse_inputs=True,
                                            logging=self.logging))

        self.layers.append(GraphConvolution(input_dim=FLAGS.hidden1,
                                            output_dim=self.output_dim,
                                            placeholders=self.placeholders,
                                            act=lambda x: x,
                                            dropout=True,
                                            logging=self.logging))

    def predict(self):
        return tf.nn.softmax(self.outputs)


class GCN_APPRO(Model):
    """Two-layer GCN where each layer gets its own (pre-sampled) support matrix."""

    def __init__(self, placeholders, input_dim, **kwargs):
        super(GCN_APPRO, self).__init__(**kwargs)

        self.inputs = placeholders['features']
        self.input_dim = input_dim
        # self.input_dim = self.inputs.get_shape().as_list()[1]  # To be supported in future Tensorflow versions
        self.output_dim = placeholders['labels'].get_shape().as_list()[1]
        self.placeholders = placeholders
        self.supports = placeholders['support']  # one support per layer

        self.optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate)

        self.build()

    def _loss(self):
        # Weight decay loss — first layer only
        for var in self.layers[0].vars.values():
            self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var)

        # Cross entropy error (unmasked: batches are pre-selected rows)
        self.loss += softmax_cross_entropy(self.outputs,
                                           self.placeholders['labels'])

    def _accuracy(self):
        self.accuracy = accuracy(self.outputs, self.placeholders['labels'])

    def _build(self):
        # appr_support = self.placeholders['support'][0]
        self.layers.append(GraphConvolution(input_dim=self.input_dim,
                                            output_dim=FLAGS.hidden1,
                                            placeholders=self.placeholders,
                                            support=self.supports[0],
                                            act=tf.nn.relu,
                                            dropout=True,
                                            sparse_inputs=False,
                                            logging=self.logging))

        self.layers.append(GraphConvolution(input_dim=FLAGS.hidden1,
                                            output_dim=self.output_dim,
                                            placeholders=self.placeholders,
                                            support=self.supports[1],
                                            act=lambda x: x,
                                            dropout=True,
                                            logging=self.logging))

    def predict(self):
        return tf.nn.softmax(self.outputs)


class GCN_APPRO_Mix(Model):  #mixture of dense and gcn
    """FastGCN model: Dense bottom layer fed with precomputed A*X, GCN top layer."""

    def __init__(self, placeholders, input_dim, **kwargs):
        super(GCN_APPRO_Mix, self).__init__(**kwargs)

        self.inputs = placeholders['AXfeatures']# A*X for the bottom layer, not original feature X
        self.input_dim = input_dim
        # self.input_dim = self.inputs.get_shape().as_list()[1]  # To be supported in future Tensorflow versions
        self.output_dim = placeholders['labels'].get_shape().as_list()[1]
        self.placeholders = placeholders
        self.support = placeholders['support']  # single support, used by the top layer

        self.optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate)

        self.build()

    def _loss(self):
        # Weight decay loss — first layer only
        for var in self.layers[0].vars.values():
            self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var)

        # Cross entropy error (unmasked)
        self.loss += softmax_cross_entropy(self.outputs, self.placeholders['labels'])

    def _accuracy(self):
        self.accuracy = accuracy(self.outputs, self.placeholders['labels'])

    def _build(self):
        # Bottom layer is Dense because A*X is already folded into the inputs.
        self.layers.append(Dense(input_dim=self.input_dim,
                                 output_dim=FLAGS.hidden1,
                                 placeholders=self.placeholders,
                                 act=tf.nn.relu,
                                 dropout=True,
                                 sparse_inputs=False,
                                 logging=self.logging))

        self.layers.append(GraphConvolution(input_dim=FLAGS.hidden1,
                                            output_dim=self.output_dim,
                                            placeholders=self.placeholders,
                                            support=self.support,
                                            act=lambda x: x,
                                            dropout=True,
                                            logging=self.logging))

    def predict(self):
        return tf.nn.softmax(self.outputs)


class GCN_APPRO_Onelayer(Model):
    """Single-layer approximation variant with masked loss/accuracy."""

    def __init__(self, placeholders, input_dim, **kwargs):
        super(GCN_APPRO_Onelayer, self).__init__(**kwargs)

        self.inputs = placeholders['features']
        self.input_dim = input_dim
        # self.input_dim = self.inputs.get_shape().as_list()[1]  # To be supported in future Tensorflow versions
        self.output_dim = placeholders['labels'].get_shape().as_list()[1]
        self.placeholders = placeholders
        self.supports = placeholders['support']

        self.optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate)

        self.build()

    def _loss(self):
        # Weight decay loss — first (only) layer
        for var in self.layers[0].vars.values():
            self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var)

        # Cross entropy error
        self.loss += masked_softmax_cross_entropy(self.outputs, self.placeholders['labels'],
                                                  self.placeholders['labels_mask'])

    def _accuracy(self):
        self.accuracy = masked_accuracy(self.outputs, self.placeholders['labels'],
                                        self.placeholders['labels_mask'])

    def _build(self):
        # NOTE(review): appr_support is assigned but never used below.
        appr_support = self.placeholders['support'][0]
        self.layers.append(GraphConvolution(input_dim=self.input_dim,
                                            output_dim=self.output_dim,
                                            placeholders=self.placeholders,
                                            support=self.supports[0],
                                            act=tf.nn.relu,
                                            dropout=True,
                                            sparse_inputs=True,
                                            logging=self.logging))

    def predict(self):
        return tf.nn.softmax(self.outputs)


================================================ FILE: pubmed-original_inductive_FastGCN.py ================================================
from __future__ import division
from __future__ import print_function

import time
import tensorflow as tf
import scipy.sparse as sp
import os

from utils import *
from models import GCN_APPRO_Mix

# Set random seed
seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)

# Settings
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_string('dataset', 'pubmed', 'Dataset string.')  # 'cora', 'citeseer', 'pubmed'
flags.DEFINE_string('model', 'gcn_mix', 'Model string.')  # 'gcn_mix', 'gcn_appr'
flags.DEFINE_float('learning_rate', 0.01, 'Initial
learning rate.')
flags.DEFINE_integer('epochs', 100, 'Number of epochs to train.')
flags.DEFINE_integer('hidden1', 16, 'Number of units in hidden layer 1.')
flags.DEFINE_float('dropout', 0.0, 'Dropout rate (1 - keep probability).')
flags.DEFINE_float('weight_decay', 5e-4, 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_integer('early_stopping', 10, 'Tolerance for early stopping (# of epochs).')
flags.DEFINE_integer('max_degree', 3, 'Maximum Chebyshev polynomial degree.')


def construct_feeddict_forMixlayers(AXfeatures, support, labels, placeholders):
    """Assemble the feed_dict for the Mix model (Dense bottom + GCN top)."""
    feed_dict = dict()
    feed_dict.update({placeholders['labels']: labels})
    feed_dict.update({placeholders['AXfeatures']: AXfeatures})
    feed_dict.update({placeholders['support']: support})
    # NOTE(review): AXfeatures[1].shape only makes sense if AXfeatures is a
    # sparse tuple; here it is fed dense rows — presumably vestigial. TODO confirm.
    feed_dict.update({placeholders['num_features_nonzero']: AXfeatures[1].shape})
    return feed_dict


def iterate_minibatches_listinputs(inputs, batchsize, shuffle=False):
    """Yield aligned minibatch slices of each array in `inputs`.

    NOTE(review): the range() stops at numSamples - batchsize + 1, so a final
    partial batch is silently dropped; also `input` shadows the builtin.
    """
    assert inputs is not None
    numSamples = inputs[0].shape[0]
    if shuffle:
        indices = np.arange(numSamples)
        np.random.shuffle(indices)
    for start_idx in range(0, numSamples - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        yield [input[excerpt] for input in inputs]


def main(rank1):
    """Inductive FastGCN training; rank1 = #sampled nodes (None = no sampling)."""
    adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data_original(FLAGS.dataset)
    train_index = np.where(train_mask)[0]
    # inductive: restrict the training graph to training nodes only
    adj_train = adj[train_index, :][:, train_index]
    train_mask = train_mask[train_index]
    y_train = y_train[train_index]
    val_index = np.where(val_mask)[0]
    y_val = y_val[val_index]
    test_index = np.where(test_mask)[0]
    y_test = y_test[test_index]

    train_val_index = np.concatenate([train_index, val_index],axis=0)
    # NOTE(review): "idnex" is a typo for "index" (used consistently below)
    train_test_idnex = np.concatenate([train_index, test_index],axis=0)

    numNode_train = adj_train.shape[0]

    # print("numNode", numNode)

    if FLAGS.model == 'gcn_mix':
        normADJ_train = nontuple_preprocess_adj(adj_train)
        # normADJ = nontuple_preprocess_adj(adj)
        normADJ_val = nontuple_preprocess_adj(adj[train_val_index,:][:,train_val_index])
        normADJ_test = nontuple_preprocess_adj(adj[train_test_idnex,:][:,train_test_idnex])
        num_supports = 2
        model_func = GCN_APPRO_Mix
    else:
        raise ValueError('Invalid argument for model: ' + str(FLAGS.model))

    # Some preprocessing: precompute A*X so the bottom layer can be Dense
    features = nontuple_preprocess_features(features).todense()
    train_features = normADJ_train.dot(features[train_index])
    val_features = normADJ_val.dot(features[train_val_index])
    test_features = normADJ_test.dot(features[train_test_idnex])

    nonzero_feature_number = len(np.nonzero(features)[0])
    nonzero_feature_number_train = len(np.nonzero(train_features)[0])

    # Define placeholders
    placeholders = {
        'support': tf.sparse_placeholder(tf.float32) ,
        'AXfeatures': tf.placeholder(tf.float32, shape=(None, features.shape[1])),
        'labels': tf.placeholder(tf.float32, shape=(None, y_train.shape[1])),
        'dropout': tf.placeholder_with_default(0., shape=()),
        'num_features_nonzero': tf.placeholder(tf.int32)  # helper variable for sparse dropout
    }

    # Create model
    model = model_func(placeholders, input_dim=features.shape[-1], logging=True)

    # Initialize session
    sess = tf.Session()

    # Define model evaluation function (closure over sess/model)
    def evaluate(features, support, labels, placeholders):
        t_test = time.time()
        feed_dict_val = construct_feeddict_forMixlayers(features, support, labels, placeholders)
        outs_val = sess.run([model.loss, model.accuracy], feed_dict=feed_dict_val)
        return outs_val[0], outs_val[1], (time.time() - t_test)

    # Init variables
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()

    cost_val = []

    # importance-sampling probabilities per column (proportional to column norm)
    p0 = column_prop(normADJ_train)

    # testSupport = [sparse_to_tuple(normADJ), sparse_to_tuple(normADJ)]
    # val/test supports: only the rows for val/test nodes (they follow the train rows)
    valSupport = sparse_to_tuple(normADJ_val[len(train_index):, :])
    testSupport = sparse_to_tuple(normADJ_test[len(train_index):, :])

    t = time.time()
    maxACC = 0.0
    # Train model
    for epoch in range(FLAGS.epochs):
        t1 = time.time()
        n = 0
        for batch in iterate_minibatches_listinputs([normADJ_train, y_train], batchsize=20, shuffle=True):
            [normADJ_batch, y_train_batch] = batch
            if rank1 is None:
                # no sampling: use the full batch support
                support1 = sparse_to_tuple(normADJ_batch)
                features_inputs = train_features
            else:
                # sample rank1 columns among the batch's non-empty neighbours
                distr = np.nonzero(np.sum(normADJ_batch, axis=0))[1]
                if rank1 > len(distr):
                    q1 = distr
                else:
                    q1 = np.random.choice(distr, rank1, replace=False, p=p0[distr]/sum(p0[distr]))  # top layer
                # importance-weight the sampled columns (1 / (p * rank))
                support1 = sparse_to_tuple(normADJ_batch[:, q1].dot(sp.diags(1.0 / (p0[q1] * rank1))))
                if len(support1[1])==0:
                    continue
                features_inputs = train_features[q1, :]  # selected nodes for approximation

            # Construct feed dictionary
            feed_dict = construct_feeddict_forMixlayers(features_inputs, support1, y_train_batch, placeholders)
            feed_dict.update({placeholders['dropout']: FLAGS.dropout})

            # Training step
            outs = sess.run([model.opt_op, model.loss, model.accuracy], feed_dict=feed_dict)
            n = n +1

        # Validation
        cost, acc, duration = evaluate(val_features, valSupport, y_val, placeholders)
        cost_val.append(cost)

        # if epoch > 50 and acc>maxACC:
        #     maxACC = acc
        #     save_path = saver.save(sess, "tmp/tmp_MixModel.ckpt")

        # Print results
        print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(outs[1]),
              "train_acc=", "{:.5f}".format(outs[2]), "val_loss=", "{:.5f}".format(cost),
              "val_acc=", "{:.5f}".format(acc),
              "time per batch=", "{:.5f}".format((time.time() - t1)/n))

        if epoch%5==0:
            # Validation
            test_cost, test_acc, test_duration = evaluate(test_features, testSupport, y_test,
                                                          placeholders)
            print("training time by far=", "{:.5f}".format(time.time() - t),
                  "epoch = {}".format(epoch + 1),
                  "cost=", "{:.5f}".format(test_cost),
                  "accuracy=", "{:.5f}".format(test_acc))

        if epoch > FLAGS.early_stopping and np.mean(cost_val[-2:]) > np.mean(cost_val[-(FLAGS.early_stopping + 1):-1]):
            # print("Early stopping...")
            break

    train_duration = time.time() - t
    # Testing
    # if os.path.exists("tmp/pubmed_MixModel.ckpt"):
    #     saver.restore(sess, "tmp/pubmed_MixModel.ckpt")
    test_cost, test_acc, test_duration = evaluate(test_features, testSupport, y_test,
                                                  placeholders)
    print("rank1 = {}".format(rank1), "cost=", "{:.5f}".format(test_cost),
          "accuracy=", "{:.5f}".format(test_acc),
          "training time=", "{:.5f}".format(train_duration),
          "training time per epoch=", "{:.5f}".format(train_duration/(epoch+1)),
          "test time=", "{:.5f}".format(test_duration))


if __name__=="__main__":
    print("DATASET:", FLAGS.dataset)
    # main(None)
    main(100)
    # for k in [5, 10, 25, 50]:
    #     main(k)


================================================ FILE: pubmed-original_transductive_FastGCN.py ================================================
from __future__ import division
from __future__ import print_function

import time
import tensorflow as tf
import scipy.sparse as sp
import os

from utils import *
from models import GCN_APPRO_Mix

# Set random seed
seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)

# Settings
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_string('dataset', 'pubmed', 'Dataset string.')  # 'cora', 'citeseer', 'pubmed'
flags.DEFINE_string('model', 'gcn_mix', 'Model string.')  # 'gcn_mix', 'gcn_appr'
flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
flags.DEFINE_integer('epochs', 100, 'Number of epochs to train.')
flags.DEFINE_integer('hidden1', 16, 'Number of units in hidden layer 1.')
flags.DEFINE_float('dropout', 0.0, 'Dropout rate (1 - keep probability).')
flags.DEFINE_float('weight_decay', 5e-4, 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_integer('early_stopping', 10, 'Tolerance for early stopping (# of epochs).')
flags.DEFINE_integer('max_degree', 3, 'Maximum Chebyshev polynomial degree.')


def construct_feeddict_forMixlayers(AXfeatures, support, labels, placeholders):
    """Assemble the feed_dict for the Mix model (same helper as inductive script)."""
    feed_dict = dict()
    feed_dict.update({placeholders['labels']: labels})
    feed_dict.update({placeholders['AXfeatures']: AXfeatures})
    feed_dict.update({placeholders['support']: support})
    feed_dict.update({placeholders['num_features_nonzero']: AXfeatures[1].shape})
    return feed_dict

def iterate_minibatches_listinputs(inputs, batchsize,
                                   shuffle=False):
    """Yield aligned minibatch slices; final partial batch is dropped (see range bound)."""
    assert inputs is not None
    numSamples = inputs[0].shape[0]
    if shuffle:
        indices = np.arange(numSamples)
        np.random.shuffle(indices)
    for start_idx in range(0, numSamples - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        yield [input[excerpt] for input in inputs]


def main(rank1):
    """Transductive FastGCN: normalize the FULL graph, then slice rows per split."""
    adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data_original(FLAGS.dataset)
    train_index = np.where(train_mask)[0]
    # NOTE(review): the trailing [:] is a no-op slice — rows only are selected
    # here (transductive), unlike the inductive script's [:, train_index].
    adj_train = adj[train_index, :][:]
    train_mask = train_mask[train_index]
    y_train = y_train[train_index]
    val_index = np.where(val_mask)[0]
    y_val = y_val[val_index]
    test_index = np.where(test_mask)[0]
    y_test = y_test[test_index]

    train_val_index = np.concatenate([train_index, val_index],axis=0)
    train_test_index = np.concatenate([train_index, test_index],axis=0)

    numNode_train = adj_train.shape[0]

    # print("numNode", numNode)

    if FLAGS.model == 'gcn_mix':
        # transductive: normalize once over the whole graph, then take row slices
        normADJ = nontuple_preprocess_adj(adj)
        # normADJ_train = nontuple_preprocess_adj(adj_train)
        # normADJ_val = nontuple_preprocess_adj(adj[train_val_index,:][:])
        # normADJ_test = nontuple_preprocess_adj(adj[train_test_idnex,:][:])
        normADJ_train = normADJ[train_index,:][:]
        normADJ_val = normADJ[train_val_index, :][:]
        normADJ_test = normADJ[train_test_index, :][:]
        num_supports = 2
        model_func = GCN_APPRO_Mix
    else:
        raise ValueError('Invalid argument for model: ' + str(FLAGS.model))

    # Some preprocessing: A*X over the full graph (transductive)
    features = nontuple_preprocess_features(features).todense()
    ax_features = normADJ.dot(features[:])
    # val_features = normADJ_val.dot(features[train_val_index])
    # test_features = normADJ_test.dot(features[train_test_idnex])

    nonzero_feature_number = len(np.nonzero(features)[0])
    nonzero_feature_number_train = len(np.nonzero(ax_features)[0])

    # Define placeholders
    placeholders = {
        'support': tf.sparse_placeholder(tf.float32) ,
        'AXfeatures': tf.placeholder(tf.float32, shape=(None, features.shape[1])),
        'labels': tf.placeholder(tf.float32, shape=(None, y_train.shape[1])),
        'dropout': tf.placeholder_with_default(0., shape=()),
        'labels_mask': tf.placeholder(tf.int32),
        'num_features_nonzero': tf.placeholder(tf.int32)  # helper variable for sparse dropout
    }

    # Create model
    model = model_func(placeholders, input_dim=features.shape[-1], logging=True)

    # Initialize session
    sess = tf.Session()

    # Define model evaluation function (closure over sess/model)
    def evaluate(features, support, labels, placeholders):
        t_test = time.time()
        feed_dict_val = construct_feeddict_forMixlayers(features, support, labels, placeholders)
        outs_val = sess.run([model.loss, model.accuracy], feed_dict=feed_dict_val)
        return outs_val[0], outs_val[1], (time.time() - t_test)

    # Init variables
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()

    cost_val = []

    # column sampling probabilities over the full normalized adjacency
    p0 = column_prop(normADJ)

    # testSupport = [sparse_to_tuple(normADJ), sparse_to_tuple(normADJ)]
    valSupport = sparse_to_tuple(normADJ_val[len(train_index):, :])
    testSupport = sparse_to_tuple(normADJ_test[len(train_index):, :])

    t = time.time()
    maxACC = 0.0
    # Train model
    for epoch in range(FLAGS.epochs):
        t1 = time.time()
        n = 0
        for batch in iterate_minibatches_listinputs([normADJ_train, y_train], batchsize=20, shuffle=True):
            [normADJ_batch, y_train_batch] = batch
            if rank1 is None:
                support1 = sparse_to_tuple(normADJ_batch)
                features_inputs = ax_features
            else:
                # sample rank1 columns among the batch's non-empty neighbours
                distr = np.nonzero(np.sum(normADJ_batch, axis=0))[1]
                if rank1 > len(distr):
                    q1 = distr
                else:
                    q1 = np.random.choice(distr, rank1, replace=False, p=p0[distr]/sum(p0[distr]))  # top layer
                support1 = sparse_to_tuple(normADJ_batch[:, q1].dot(sp.diags(1.0 / (p0[q1] * rank1))))
                if len(support1[1])==0:
                    continue
                features_inputs = ax_features[q1, :]  # selected nodes for approximation

            # Construct feed dictionary
            feed_dict = construct_feeddict_forMixlayers(features_inputs, support1, y_train_batch, placeholders)
            feed_dict.update({placeholders['dropout']: FLAGS.dropout})

            # Training step
            outs = sess.run([model.opt_op, model.loss, model.accuracy], feed_dict=feed_dict)
            n = n +1

        # Validation
        cost, acc, duration = evaluate(ax_features, valSupport, y_val, placeholders)
        cost_val.append(cost)

        # if epoch > 50 and acc>maxACC:
        #     maxACC = acc
        #     save_path = saver.save(sess, "tmp/tmp_MixModel.ckpt")

        # Print results
        print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(outs[1]),
              "train_acc=", "{:.5f}".format(outs[2]), "val_loss=", "{:.5f}".format(cost),
              "val_acc=", "{:.5f}".format(acc),
              "time per batch=", "{:.5f}".format((time.time() - t1)/n))

        # if epoch%5==0:
        #     # Validation
        #     test_cost, test_acc, test_duration = evaluate(ax_features, testSupport, y_test,
        #                                                   placeholders)
        #     print("training time by far=", "{:.5f}".format(time.time() - t),
        #           "epoch = {}".format(epoch + 1),
        #           "cost=", "{:.5f}".format(test_cost),
        #           "accuracy=", "{:.5f}".format(test_acc))

        if epoch > FLAGS.early_stopping and np.mean(cost_val[-2:]) > np.mean(cost_val[-(FLAGS.early_stopping + 1):-1]):
            # print("Early stopping...")
            break

    train_duration = time.time() - t
    # Testing
    # if os.path.exists("tmp/pubmed_MixModel.ckpt"):
    #     saver.restore(sess, "tmp/pubmed_MixModel.ckpt")
    test_cost, test_acc, test_duration = evaluate(ax_features, testSupport, y_test,
                                                  placeholders)
    print("rank1 = {}".format(rank1), "cost=", "{:.5f}".format(test_cost),
          "accuracy=", "{:.5f}".format(test_acc),
          "training time=", "{:.5f}".format(train_duration),
          "training time per epoch=", "{:.5f}".format(train_duration/(epoch+1)),
          "test time=", "{:.5f}".format(test_duration))


if __name__=="__main__":
    print("DATASET:", FLAGS.dataset)
    main(400)
    # main(100)
    # for k in [5, 10, 25, 50]:
    #     main(k)


================================================ FILE: pubmed_Mix.py ================================================
from __future__ import division
from __future__ import print_function

import time
import tensorflow as tf
import scipy.sparse as sp
import os

from utils import *
from models import GCN_APPRO_Mix

# Set random seed
seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)

# Settings
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_string('dataset', 'pubmed', 'Dataset string.')  # 'cora', 'citeseer', 'pubmed'
flags.DEFINE_string('model', 'gcn_mix', 'Model string.')  # 'gcn_mix', 'gcn_appr'
flags.DEFINE_float('learning_rate', 0.001, 'Initial learning rate.')
flags.DEFINE_integer('epochs', 200, 'Number of epochs to train.')
flags.DEFINE_integer('hidden1', 16, 'Number of units in hidden layer 1.')
flags.DEFINE_float('dropout', 0.0, 'Dropout rate (1 - keep probability).')
flags.DEFINE_float('weight_decay', 5e-4, 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_integer('early_stopping', 30, 'Tolerance for early stopping (# of epochs).')
flags.DEFINE_integer('max_degree', 3, 'Maximum Chebyshev polynomial degree.')


def construct_feeddict_forMixlayers(AXfeatures, support, labels, placeholders):
    """Assemble the feed_dict for the Mix model (Dense bottom + GCN top)."""
    feed_dict = dict()
    feed_dict.update({placeholders['labels']: labels})
    feed_dict.update({placeholders['AXfeatures']: AXfeatures})
    feed_dict.update({placeholders['support']: support})
    feed_dict.update({placeholders['num_features_nonzero']: AXfeatures[1].shape})
    return feed_dict


def iterate_minibatches_listinputs(inputs, batchsize, shuffle=False):
    """Yield aligned minibatch slices; final partial batch is dropped (see range bound)."""
    assert inputs is not None
    numSamples = inputs[0].shape[0]
    if shuffle:
        indices = np.arange(numSamples)
        np.random.shuffle(indices)
    for start_idx in range(0, numSamples - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        yield [input[excerpt] for input in inputs]


def main(rank1):
    """Inductive Mix training with PER-BATCH sampling distribution (column_prop of the batch)."""
    adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(FLAGS.dataset)
    train_index = np.where(train_mask)[0]
    adj_train = adj[train_index, :][:, train_index]
    train_mask = train_mask[train_index]
    y_train = y_train[train_index]
    val_index = np.where(val_mask)[0]
    y_val = y_val[val_index]
    test_index = np.where(test_mask)[0]
    y_test = y_test[test_index]

    train_val_index = np.concatenate([train_index, val_index],axis=0)
    # NOTE(review): "idnex" is a typo for "index" (used consistently below)
    train_test_idnex = np.concatenate([train_index, test_index],axis=0)

    numNode_train = adj_train.shape[0]

    # print("numNode", numNode)

    if FLAGS.model == 'gcn_mix':
        normADJ_train = nontuple_preprocess_adj(adj_train)
        # normADJ = nontuple_preprocess_adj(adj)
        normADJ_val = nontuple_preprocess_adj(adj[train_val_index,:][:,train_val_index])
        normADJ_test = nontuple_preprocess_adj(adj[train_test_idnex,:][:,train_test_idnex])
        num_supports = 2
        model_func = GCN_APPRO_Mix
    else:
        raise ValueError('Invalid argument for model: ' + str(FLAGS.model))

    # Some preprocessing: precompute A*X so the bottom layer can be Dense
    features = nontuple_preprocess_features(features).todense()
    train_features = normADJ_train.dot(features[train_index])
    val_features = normADJ_val.dot(features[train_val_index])
    test_features = normADJ_test.dot(features[train_test_idnex])

    nonzero_feature_number = len(np.nonzero(features)[0])
    nonzero_feature_number_train = len(np.nonzero(train_features)[0])

    # Define placeholders
    placeholders = {
        'support': tf.sparse_placeholder(tf.float32) ,
        'AXfeatures': tf.placeholder(tf.float32, shape=(None, features.shape[1])),
        'labels': tf.placeholder(tf.float32, shape=(None, y_train.shape[1])),
        'dropout': tf.placeholder_with_default(0., shape=()),
        'num_features_nonzero': tf.placeholder(tf.int32)  # helper variable for sparse dropout
    }

    # Create model
    model = model_func(placeholders, input_dim=features.shape[-1], logging=True)

    # Initialize session
    sess = tf.Session()

    # Define model evaluation function (closure over sess/model)
    def evaluate(features, support, labels, placeholders):
        t_test = time.time()
        feed_dict_val = construct_feeddict_forMixlayers(features, support, labels, placeholders)
        outs_val = sess.run([model.loss, model.accuracy], feed_dict=feed_dict_val)
        return outs_val[0], outs_val[1], (time.time() - t_test)

    # Init variables
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()

    cost_val = []

    # NOTE(review): p0 is computed here but the loop below samples from the
    # per-batch p1 instead — p0 is unused in this script.
    p0 = column_prop(normADJ_train)

    # testSupport = [sparse_to_tuple(normADJ), sparse_to_tuple(normADJ)]
    valSupport = sparse_to_tuple(normADJ_val[len(train_index):, :])
    testSupport = sparse_to_tuple(normADJ_test[len(train_index):, :])

    t = time.time()
    maxACC = 0.0
    # Train model
    for epoch in range(FLAGS.epochs):
        t1 = time.time()
        n = 0
        for batch in iterate_minibatches_listinputs([normADJ_train, y_train], batchsize=1024, shuffle=True):
            [normADJ_batch, y_train_batch] = batch
            # per-batch sampling distribution (contrast with pubmed_Mix_sampleA's global p0)
            p1 = column_prop(normADJ_batch)
            if rank1 is None:
                support1 = sparse_to_tuple(normADJ_batch)
                features_inputs = train_features
            else:
                q1 = np.random.choice(np.arange(numNode_train), rank1, replace=False, p=p1)  # top layer
                # importance-weight the sampled columns (1 / (p * rank))
                support1 = sparse_to_tuple(normADJ_batch[:, q1].dot(sp.diags(1.0 / (p1[q1] * rank1))))
                features_inputs = train_features[q1, :]  # selected nodes for approximation

            # Construct feed dictionary
            feed_dict = construct_feeddict_forMixlayers(features_inputs, support1, y_train_batch, placeholders)
            feed_dict.update({placeholders['dropout']: FLAGS.dropout})

            # Training step
            outs = sess.run([model.opt_op, model.loss, model.accuracy], feed_dict=feed_dict)
            n = n +1

        # Validation
        cost, acc, duration = evaluate(val_features, valSupport, y_val, placeholders)
        cost_val.append(cost)

        # if epoch > 50 and acc>maxACC:
        #     maxACC = acc
        #     save_path = saver.save(sess, "tmp/tmp_MixModel.ckpt")

        # Print results
        # print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(outs[1]),
        #       "train_acc=", "{:.5f}".format(outs[2]), "val_loss=", "{:.5f}".format(cost),
        #       "val_acc=", "{:.5f}".format(acc), "time per batch=", "{:.5f}".format((time.time() - t1)/n))

        if epoch > FLAGS.early_stopping and np.mean(cost_val[-2:]) > np.mean(cost_val[-(FLAGS.early_stopping + 1):-1]):
            # print("Early stopping...")
            break

    train_duration = time.time() - t
    # Testing
    # if os.path.exists("tmp/pubmed_MixModel.ckpt"):
    #     saver.restore(sess, "tmp/pubmed_MixModel.ckpt")
    test_cost, test_acc, test_duration = evaluate(test_features, testSupport, y_test,
                                                  placeholders)
    print("rank1 = {}".format(rank1), "cost=", "{:.5f}".format(test_cost),
          "accuracy=", "{:.5f}".format(test_acc),
          "training time per epoch=", "{:.5f}".format(train_duration/(epoch+1)),
          "test time=", "{:.5f}".format(test_duration))


if __name__=="__main__":
    print("DATASET:", FLAGS.dataset)
    for k in [25, 50, 100, 200, 400]:
        main(k)


================================================ FILE: pubmed_Mix_sampleA.py ================================================
from __future__ import division
from __future__ import print_function

import time
import tensorflow as tf
import scipy.sparse as sp
import os

from utils import *
from models import GCN_APPRO_Mix

# Set random seed
seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)

# Settings
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_string('dataset', 'pubmed', 'Dataset string.')  # 'cora', 'citeseer', 'pubmed'
flags.DEFINE_string('model', 'gcn_mix', 'Model string.')  # 'gcn_mix', 'gcn_appr'
flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
flags.DEFINE_integer('epochs', 200, 'Number of epochs to train.')
flags.DEFINE_integer('hidden1', 16, 'Number of units in hidden layer 1.')
flags.DEFINE_float('dropout', 0.0, 'Dropout rate (1 - keep probability).')
flags.DEFINE_float('weight_decay', 5e-4, 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_integer('early_stopping', 30, 'Tolerance for early stopping (# of epochs).')
flags.DEFINE_integer('max_degree', 3, 'Maximum Chebyshev polynomial degree.')


def construct_feeddict_forMixlayers(AXfeatures, support, labels, placeholders):
    """Assemble the feed_dict for the Mix model (Dense bottom + GCN top)."""
    feed_dict = dict()
    feed_dict.update({placeholders['labels']: labels})
    feed_dict.update({placeholders['AXfeatures']: AXfeatures})
    feed_dict.update({placeholders['support']: support})
    feed_dict.update({placeholders['num_features_nonzero']: AXfeatures[1].shape})
    return feed_dict


def iterate_minibatches_listinputs(inputs, batchsize, shuffle=False):
    """Yield aligned minibatch slices of each array in `inputs`."""
    assert inputs is not None
    numSamples = inputs[0].shape[0]
    if shuffle:
        indices = np.arange(numSamples)
        np.random.shuffle(indices)
    # final partial batch is dropped (range stops at numSamples - batchsize + 1)
    for start_idx in range(0, numSamples -
                           batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        yield [input[excerpt] for input in inputs]


def main(rank1):
    """Inductive Mix training sampling from the GLOBAL distribution p0 restricted to batch neighbours."""
    adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(FLAGS.dataset)
    train_index = np.where(train_mask)[0]
    adj_train = adj[train_index, :][:, train_index]
    train_mask = train_mask[train_index]
    y_train = y_train[train_index]
    val_index = np.where(val_mask)[0]
    y_val = y_val[val_index]
    test_index = np.where(test_mask)[0]
    y_test = y_test[test_index]

    train_val_index = np.concatenate([train_index, val_index],axis=0)
    # NOTE(review): "idnex" is a typo for "index" (used consistently below)
    train_test_idnex = np.concatenate([train_index, test_index],axis=0)

    numNode_train = adj_train.shape[0]

    # print("numNode", numNode)

    if FLAGS.model == 'gcn_mix':
        normADJ_train = nontuple_preprocess_adj(adj_train)
        # normADJ = nontuple_preprocess_adj(adj)
        normADJ_val = nontuple_preprocess_adj(adj[train_val_index,:][:,train_val_index])
        normADJ_test = nontuple_preprocess_adj(adj[train_test_idnex,:][:,train_test_idnex])
        num_supports = 2
        model_func = GCN_APPRO_Mix
    else:
        raise ValueError('Invalid argument for model: ' + str(FLAGS.model))

    # Some preprocessing: precompute A*X so the bottom layer can be Dense
    features = nontuple_preprocess_features(features).todense()
    train_features = normADJ_train.dot(features[train_index])
    val_features = normADJ_val.dot(features[train_val_index])
    test_features = normADJ_test.dot(features[train_test_idnex])

    nonzero_feature_number = len(np.nonzero(features)[0])
    nonzero_feature_number_train = len(np.nonzero(train_features)[0])

    # Define placeholders
    placeholders = {
        'support': tf.sparse_placeholder(tf.float32) ,
        'AXfeatures': tf.placeholder(tf.float32, shape=(None, features.shape[1])),
        'labels': tf.placeholder(tf.float32, shape=(None, y_train.shape[1])),
        'dropout': tf.placeholder_with_default(0., shape=()),
        'num_features_nonzero': tf.placeholder(tf.int32)  # helper variable for sparse dropout
    }

    # Create model
    model = model_func(placeholders, input_dim=features.shape[-1], logging=True)

    # Initialize session
    sess = tf.Session()

    # Define model evaluation function (closure over sess/model)
    def evaluate(features, support, labels, placeholders):
        t_test = time.time()
        feed_dict_val = construct_feeddict_forMixlayers(features, support, labels, placeholders)
        outs_val = sess.run([model.loss, model.accuracy], feed_dict=feed_dict_val)
        return outs_val[0], outs_val[1], (time.time() - t_test)

    # Init variables
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()

    cost_val = []

    # global importance-sampling probabilities per column of the training support
    p0 = column_prop(normADJ_train)

    # testSupport = [sparse_to_tuple(normADJ), sparse_to_tuple(normADJ)]
    valSupport = sparse_to_tuple(normADJ_val[len(train_index):, :])
    testSupport = sparse_to_tuple(normADJ_test[len(train_index):, :])

    t = time.time()
    maxACC = 0.0
    # Train model
    for epoch in range(FLAGS.epochs):
        t1 = time.time()
        n = 0
        for batch in iterate_minibatches_listinputs([normADJ_train, y_train], batchsize=1024, shuffle=True):
            [normADJ_batch, y_train_batch] = batch
            if rank1 is None:
                support1 = sparse_to_tuple(normADJ_batch)
                features_inputs = train_features
            else:
                # sample rank1 columns among the batch's non-empty neighbours
                distr = np.nonzero(np.sum(normADJ_batch, axis=0))[1]
                if rank1 > len(distr):
                    q1 = distr
                else:
                    q1 = np.random.choice(distr, rank1, replace=False, p=p0[distr]/sum(p0[distr]))  # top layer
                # importance-weight the sampled columns (1 / (p * rank))
                support1 = sparse_to_tuple(normADJ_batch[:, q1].dot(sp.diags(1.0 / (p0[q1] * rank1))))
                if len(support1[1])==0:
                    continue
                features_inputs = train_features[q1, :]  # selected nodes for approximation

            # Construct feed dictionary
            feed_dict = construct_feeddict_forMixlayers(features_inputs, support1, y_train_batch, placeholders)
            feed_dict.update({placeholders['dropout']: FLAGS.dropout})

            # Training step
            outs = sess.run([model.opt_op, model.loss, model.accuracy], feed_dict=feed_dict)
            n = n +1

        # Validation
        cost, acc, duration = evaluate(val_features, valSupport, y_val, placeholders)
        cost_val.append(cost)

        # if epoch > 50 and acc>maxACC:
        #     maxACC = acc
        #     save_path = saver.save(sess, "tmp/tmp_MixModel.ckpt")

        # Print results
        # print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(outs[1]),
        #       "train_acc=", "{:.5f}".format(outs[2]), "val_loss=", "{:.5f}".format(cost),
        #       "val_acc=", "{:.5f}".format(acc), "time per batch=", "{:.5f}".format((time.time() - t1)/n))

        if epoch > FLAGS.early_stopping and np.mean(cost_val[-2:]) > np.mean(cost_val[-(FLAGS.early_stopping + 1):-1]):
            # print("Early stopping...")
            break

    train_duration = time.time() - t
    # Testing
    # if os.path.exists("tmp/pubmed_MixModel.ckpt"):
    #     saver.restore(sess, "tmp/pubmed_MixModel.ckpt")
    test_cost, test_acc, test_duration = evaluate(test_features, testSupport, y_test,
                                                  placeholders)
    print("rank1 = {}".format(rank1), "cost=", "{:.5f}".format(test_cost),
          "accuracy=", "{:.5f}".format(test_acc),
          "training time per epoch=", "{:.5f}".format(train_duration/(epoch+1)),
          "test time=", "{:.5f}".format(test_duration))


if __name__=="__main__":
    print("DATASET:", FLAGS.dataset)
    # main(None)
    main(50)
    # for k in [25, 50, 100, 200, 400]:
    #     main(k)


================================================ FILE: pubmed_Mix_uniform.py ================================================
from __future__ import division
from __future__ import print_function

import time
import tensorflow as tf
import scipy.sparse as sp
import os

from utils import *
from models import GCN_APPRO_Mix

# Set random seed
seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)

# Settings
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_string('dataset', 'pubmed', 'Dataset string.')  # 'cora', 'citeseer', 'pubmed'
flags.DEFINE_string('model', 'gcn_mix', 'Model string.')  # 'gcn_mix', 'gcn_appr'
flags.DEFINE_float('learning_rate', 0.001, 'Initial learning rate.')
flags.DEFINE_integer('epochs', 200, 'Number of epochs to train.')
flags.DEFINE_integer('hidden1', 16, 'Number of units in hidden layer 1.')
flags.DEFINE_float('dropout', 0.0, 'Dropout rate (1 - keep probability).')
flags.DEFINE_float('weight_decay', 5e-4, 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_integer('early_stopping', 30, 'Tolerance for early stopping (# of epochs).')
flags.DEFINE_integer('max_degree', 3, 'Maximum Chebyshev polynomial degree.')


def construct_feeddict_forMixlayers(AXfeatures, support, labels, placeholders):
    """Assemble the feed_dict for the mixed-layers model.

    AXfeatures is the precomputed A*X product; support is the sampled
    adjacency as a sparse tuple.
    """
    feed_dict = dict()
    feed_dict.update({placeholders['labels']: labels})
    feed_dict.update({placeholders['AXfeatures']: AXfeatures})
    feed_dict.update({placeholders['support']: support})
    feed_dict.update({placeholders['num_features_nonzero']: AXfeatures[1].shape})
    return feed_dict


def iterate_minibatches_listinputs(inputs, batchsize, shuffle=False):
    """Yield aligned minibatch slices from a list of equally-sized inputs.

    Only full batches are produced; a trailing partial batch is dropped.
    NOTE(review): the loop variable `input` shadows the Python builtin.
    """
    assert inputs is not None
    numSamples = inputs[0].shape[0]
    if shuffle:
        indices = np.arange(numSamples)
        np.random.shuffle(indices)
    for start_idx in range(0, numSamples - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        yield [input[excerpt] for input in inputs]


def main(rank1):
    """Train/evaluate mixed-layers FastGCN on Pubmed with UNIFORM sampling.

    rank1: number of nodes uniformly sampled per batch, or None for no sampling.
    """
    adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(FLAGS.dataset)
    train_index = np.where(train_mask)[0]
    adj_train = adj[train_index, :][:, train_index]
    train_mask = train_mask[train_index]
    y_train = y_train[train_index]
    val_index = np.where(val_mask)[0]
    y_val = y_val[val_index]
    test_index = np.where(test_mask)[0]
    y_test = y_test[test_index]

    # NOTE(review): 'train_test_idnex' is a typo for 'train_test_index'
    # (local variable only, used consistently below).
    train_val_index = np.concatenate([train_index, val_index],axis=0)
    train_test_idnex = np.concatenate([train_index, test_index],axis=0)

    numNode_train = adj_train.shape[0]

    # print("numNode", numNode)

    if FLAGS.model == 'gcn_mix':
        normADJ_train = nontuple_preprocess_adj(adj_train)
        # normADJ = nontuple_preprocess_adj(adj)
        normADJ_val = nontuple_preprocess_adj(adj[train_val_index,:][:,train_val_index])
        normADJ_test = nontuple_preprocess_adj(adj[train_test_idnex,:][:,train_test_idnex])
        num_supports = 2
        model_func = GCN_APPRO_Mix
    else:
        raise ValueError('Invalid argument for model: ' + str(FLAGS.model))

    # Some preprocessing: precompute A*X for train/val/test node sets.
    features = nontuple_preprocess_features(features).todense()
    train_features = normADJ_train.dot(features[train_index])
    val_features = normADJ_val.dot(features[train_val_index])
    test_features = normADJ_test.dot(features[train_test_idnex])

    nonzero_feature_number = len(np.nonzero(features)[0])
    nonzero_feature_number_train = len(np.nonzero(train_features)[0])

    # Define placeholders
    placeholders = {
        'support': tf.sparse_placeholder(tf.float32) ,
        'AXfeatures': tf.placeholder(tf.float32, shape=(None, features.shape[1])),
        'labels': tf.placeholder(tf.float32, shape=(None, y_train.shape[1])),
        'dropout': tf.placeholder_with_default(0., shape=()),
        'num_features_nonzero': tf.placeholder(tf.int32)  # helper variable for sparse dropout
    }

    # Create model
    model = model_func(placeholders, input_dim=features.shape[-1], logging=True)

    # Initialize session
    sess = tf.Session()

    # Define model evaluation function
    def evaluate(features, support, labels, placeholders):
        """Run loss/accuracy on the given (features, support, labels) and time it."""
        t_test = time.time()
        feed_dict_val = construct_feeddict_forMixlayers(features, support, labels, placeholders)
        outs_val = sess.run([model.loss, model.accuracy], feed_dict=feed_dict_val)
        return outs_val[0], outs_val[1], (time.time() - t_test)

    # Init variables
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()

    cost_val = []

    p0 = column_prop(normADJ_train)

    # testSupport = [sparse_to_tuple(normADJ), sparse_to_tuple(normADJ)]
    valSupport = sparse_to_tuple(normADJ_val[len(train_index):, :])
    testSupport = sparse_to_tuple(normADJ_test[len(train_index):, :])

    t = time.time()
    maxACC = 0.0
    # Train model
    for epoch in range(FLAGS.epochs):
        t1 = time.time()
        n = 0
        for batch in iterate_minibatches_listinputs([normADJ_train, y_train], batchsize=1024, shuffle=True):
            [normADJ_batch, y_train_batch] = batch
            # NOTE(review): p1 is computed every batch but never used in this
            # uniform-sampling variant (sampling below ignores probabilities).
            p1 = column_prop(normADJ_batch)
            if rank1 is None:
                support1 = sparse_to_tuple(normADJ_batch)
                features_inputs = train_features
            else:
                distr = np.nonzero(np.sum(normADJ_batch, axis=0))[1]
                if rank1 > len(distr):
                    q1 = distr
                else:
                    # Uniform sampling without replacement over the active columns.
                    q1 = np.random.choice(distr, rank1, replace=False)  # top layer

                # q1 = np.random.choice(np.arange(numNode_train), rank1)  # top layer
                # Scale by N/|q1| to keep the uniform estimator unbiased.
                support1 = sparse_to_tuple(normADJ_batch[:, q1] * numNode_train / len(q1))
                features_inputs = train_features[q1, :]  # selected nodes for approximation
            # Construct feed dictionary
            feed_dict = construct_feeddict_forMixlayers(features_inputs, support1, y_train_batch, placeholders)
            feed_dict.update({placeholders['dropout']: FLAGS.dropout})

            # Training step
            outs = sess.run([model.opt_op, model.loss, model.accuracy], feed_dict=feed_dict)
            n = n +1

        # Validation
        cost, acc, duration = evaluate(val_features, valSupport, y_val, placeholders)
        cost_val.append(cost)
        # if epoch > 50 and acc>maxACC:
        #     maxACC = acc
        #     save_path = saver.save(sess, "tmp/tmp_MixModel.ckpt")

        # Print results
        # print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(outs[1]),
        #       "train_acc=", "{:.5f}".format(outs[2]), "val_loss=", "{:.5f}".format(cost),
        #       "val_acc=", "{:.5f}".format(acc), "time per batch=", "{:.5f}".format((time.time() - t1)/n))

        if epoch > FLAGS.early_stopping and np.mean(cost_val[-2:]) > np.mean(cost_val[-(FLAGS.early_stopping + 1):-1]):
            # print("Early stopping...")
            break

    train_duration = time.time() - t
    # Testing
    # if os.path.exists("tmp/pubmed_MixModel.ckpt"):
    #     saver.restore(sess, "tmp/pubmed_MixModel.ckpt")
    test_cost, test_acc, test_duration = evaluate(test_features, testSupport, y_test, placeholders)
    print("rank1 = {}".format(rank1), "cost=", "{:.5f}".format(test_cost),
          "accuracy=", "{:.5f}".format(test_acc), "training time per epoch=", "{:.5f}".format(train_duration/(epoch+1)),
          "test time=", "{:.5f}".format(test_duration))


if __name__=="__main__":
    print("DATASET:", FLAGS.dataset)
    main(5)
    # for k in [25, 50, 100, 200, 400]:
    #     main(k)

================================================
FILE: pubmed_inductive_appr2layers.py
================================================
# FastGCN on Pubmed: inductive training with TWO approximated (sampled) layers.
from __future__ import division
from __future__ import print_function

import time
import tensorflow as tf
import scipy.sparse as sp

from utils import *
from models import GCN, MLP, GCN_APPRO

# Set random seed
seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)

# Settings
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_string('dataset', 'pubmed', 'Dataset string.')  # 'cora', 'citeseer', 'pubmed'
flags.DEFINE_string('model', 'gcn_appr', 'Model string.')  # 'gcn', 'gcn_appr'
flags.DEFINE_float('learning_rate', 0.001, 'Initial learning rate.')
flags.DEFINE_integer('epochs', 200, 'Number of epochs to train.')
flags.DEFINE_integer('hidden1', 16, 'Number of units in hidden layer 1.')
flags.DEFINE_float('dropout', 0.0, 'Dropout rate (1 - keep probability).')
flags.DEFINE_float('weight_decay', 5e-4, 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_integer('early_stopping', 30, 'Tolerance for early stopping (# of epochs).')
flags.DEFINE_integer('max_degree', 3, 'Maximum Chebyshev polynomial degree.')


# Load data
def iterate_minibatches_listinputs(inputs, batchsize, shuffle=False):
    """Yield aligned minibatch slices from a list of equally-sized inputs.

    Only full batches are produced; a trailing partial batch is dropped.
    NOTE(review): the loop variable `input` shadows the Python builtin.
    """
    assert inputs is not None
    numSamples = inputs[0].shape[0]
    if shuffle:
        indices = np.arange(numSamples)
        np.random.shuffle(indices)
    for start_idx in range(0, numSamples - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        yield [input[excerpt] for input in inputs]


def main(rank1, rank0):
    """Train/evaluate two-sampled-layer FastGCN (GCN_APPRO) on Pubmed.

    rank1: sample size for the top (output-side) layer.
    rank0: sample size for the bottom (input-side) layer.
    """
    adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(FLAGS.dataset)
    train_index = np.where(train_mask)[0]
    adj_train = adj[train_index, :][:, train_index]
    train_mask = train_mask[train_index]
    y_train = y_train[train_index]
    val_index = np.where(val_mask)[0]
    # adj_val = adj[val_index, :][:, val_index]
    val_mask = val_mask[val_index]
    y_val = y_val[val_index]
    test_index = np.where(test_mask)[0]
    # adj_test = adj[test_index, :][:, test_index]
    test_mask = test_mask[test_index]
    y_test = y_test[test_index]
    numNode_train = adj_train.shape[0]

    # print("numNode", numNode)

    # Some preprocessing
    features = nontuple_preprocess_features(features).todense()
    train_features = features[train_index]

    if FLAGS.model == 'gcn_appr':
        normADJ_train = nontuple_preprocess_adj(adj_train)
        normADJ = nontuple_preprocess_adj(adj)
        # normADJ_val = nontuple_preprocess_adj(adj_val)
        # normADJ_test = nontuple_preprocess_adj(adj_test)
        num_supports = 2
        model_func = GCN_APPRO
    else:
        raise ValueError('Invalid argument for model: ' + str(FLAGS.model))

    # Define placeholders
    placeholders = {
        'support': [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
        'features': tf.placeholder(tf.float32, shape=(None, features.shape[1])),
        'labels': tf.placeholder(tf.float32, shape=(None, y_train.shape[1])),
        'labels_mask': tf.placeholder(tf.int32),
        'dropout': tf.placeholder_with_default(0., shape=()),
        'num_features_nonzero': tf.placeholder(tf.int32)  # helper variable for sparse dropout
    }

    # Create model
    model = model_func(placeholders, input_dim=features.shape[-1], logging=True)

    # Initialize session
    sess = tf.Session()

    # Define model evaluation function
    def evaluate(features, support, labels, mask, placeholders):
        """Run masked loss/accuracy on the given inputs and time it."""
        t_test = time.time()
        feed_dict_val = construct_feed_dict(features, support, labels, mask, placeholders)
        outs_val = sess.run([model.loss, model.accuracy], feed_dict=feed_dict_val)
        return outs_val[0], outs_val[1], (time.time() - t_test)

    # Init variables
    sess.run(tf.global_variables_initializer())

    cost_val = []

    p0 = column_prop(normADJ_train)

    # testSupport = [sparse_to_tuple(normADJ), sparse_to_tuple(normADJ)]
    # Val/test use the full normalized adjacency (transductive evaluation).
    valSupport = [sparse_to_tuple(normADJ), sparse_to_tuple(normADJ[val_index, :])]
    testSupport = [sparse_to_tuple(normADJ), sparse_to_tuple(normADJ[test_index, :])]

    t = time.time()
    # Train model
    for epoch in range(FLAGS.epochs):
        t1 = time.time()
        # NOTE(review): n is initialized but never incremented in this script.
        n = 0
        for batch in iterate_minibatches_listinputs([normADJ_train, y_train, train_mask], batchsize=256, shuffle=True):
            [normADJ_batch, y_train_batch, train_mask_batch] = batch
            if sum(train_mask_batch) < 1:
                # No labeled node in this batch — skip it.
                continue
            p1 = column_prop(normADJ_batch)
            q1 = np.random.choice(np.arange(numNode_train), rank1, p=p1)  # top layer

            # q0 = np.random.choice(np.arange(numNode_train), rank0, p=p0)  # bottom layer
            # Rescale sampled columns by 1/(p*rank) to keep the estimator unbiased.
            support1 = sparse_to_tuple(normADJ_batch[:, q1].dot(sp.diags(1.0 / (p1[q1] * rank1))))

            # Bottom-layer sampling conditioned on the q1 rows.
            p2 = column_prop(normADJ_train[q1, :])
            q0 = np.random.choice(np.arange(numNode_train), rank0, p=p2)
            support0 = sparse_to_tuple(normADJ_train[q1, :][:, q0])

            features_inputs = sp.diags(1.0 / (p2[q0] * rank0)).dot(train_features[q0, :])  # selected nodes for approximation

            # Construct feed dictionary
            feed_dict = construct_feed_dict(features_inputs, [support0, support1], y_train_batch, train_mask_batch,
                                            placeholders)
            feed_dict.update({placeholders['dropout']: FLAGS.dropout})

            # Training step
            outs = sess.run([model.opt_op, model.loss, model.accuracy], feed_dict=feed_dict)

        # Validation
        cost, acc, duration = evaluate(features, valSupport, y_val, val_mask, placeholders)
        cost_val.append(cost)

        # # Print results
        print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(outs[1]),
              "train_acc=", "{:.5f}".format(outs[2]), "val_loss=", "{:.5f}".format(cost),
              "val_acc=", "{:.5f}".format(acc), "time=", "{:.5f}".format(time.time() - t1))

        if epoch > FLAGS.early_stopping and cost_val[-1] > np.mean(cost_val[-(FLAGS.early_stopping + 1):-1]):
            # print("Early stopping...")
            break

    train_duration = time.time() - t
    # Testing
    test_cost, test_acc, test_duration = evaluate(features, testSupport, y_test, test_mask, placeholders)
    # NOTE(review): train_duration/epoch divides by zero if the loop exits at
    # epoch 0 (cannot happen with early_stopping=30, but fragile).
    print("rank1 = {}".format(rank1), "rank0 = {}".format(rank0), "cost=", "{:.5f}".format(test_cost),
          "accuracy=", "{:.5f}".format(test_acc), "training time per epoch=", "{:.5f}".format(train_duration/epoch))


if __name__=="__main__":
    print("DATASET:", FLAGS.dataset)
    for k in [5, 10, 25, 50]:
        main(k, k)
    # main(50,50)
    # for k in [50, 100, 200, 400]:
    #     main(k, k)

================================================
FILE: train.py
================================================
# Baseline (original Kipf & Welling style) GCN training script, transductive.
from __future__ import division
from
__future__ import print_function

import time
import tensorflow as tf

from utils import *
from models import GCN, MLP

# Set random seed
seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)

# Settings
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_string('dataset', 'pubmed', 'Dataset string.')  # 'cora', 'citeseer', 'pubmed'
flags.DEFINE_string('model', 'gcn', 'Model string.')  # 'gcn', 'gcn_cheby', 'dense'
flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
flags.DEFINE_integer('epochs', 200, 'Number of epochs to train.')
flags.DEFINE_integer('hidden1', 16, 'Number of units in hidden layer 1.')
flags.DEFINE_float('dropout', 0.5, 'Dropout rate (1 - keep probability).')
flags.DEFINE_float('weight_decay', 5e-4, 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_integer('early_stopping', 10, 'Tolerance for early stopping (# of epochs).')
flags.DEFINE_integer('max_degree', 3, 'Maximum Chebyshev polynomial degree.')

# Load data
adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(FLAGS.dataset)

# Some preprocessing: features become a sparse tuple (coords, values, shape).
features = preprocess_features(features)
if FLAGS.model == 'gcn':
    support = [preprocess_adj(adj)]
    num_supports = 1
    model_func = GCN
elif FLAGS.model == 'gcn_cheby':
    support = chebyshev_polynomials(adj, FLAGS.max_degree)
    num_supports = 1 + FLAGS.max_degree
    model_func = GCN
elif FLAGS.model == 'dense':
    support = [preprocess_adj(adj)]  # Not used
    num_supports = 1
    model_func = MLP
else:
    raise ValueError('Invalid argument for model: ' + str(FLAGS.model))

# Define placeholders
placeholders = {
    'support': [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
    'features': tf.sparse_placeholder(tf.float32, shape=tf.constant(features[2], dtype=tf.int64)),
    'labels': tf.placeholder(tf.float32, shape=(None, y_train.shape[1])),
    'labels_mask': tf.placeholder(tf.int32),
    'dropout': tf.placeholder_with_default(0., shape=()),
    'num_features_nonzero': tf.placeholder(tf.int32)  # helper variable for sparse dropout
}

# Create model; features[2][1] is the feature dimension from the sparse tuple.
model = model_func(placeholders, input_dim=features[2][1], logging=True)
print(adj.shape[0])

# Initialize session
sess = tf.Session()


# Define model evaluation function
def evaluate(features, support, labels, mask, placeholders):
    """Run masked loss/accuracy on the given inputs and time it."""
    t_test = time.time()
    feed_dict_val = construct_feed_dict(features, support, labels, mask, placeholders)
    outs_val = sess.run([model.loss, model.accuracy], feed_dict=feed_dict_val)
    return outs_val[0], outs_val[1], (time.time() - t_test)


# Init variables
sess.run(tf.global_variables_initializer())

cost_val = []
t_start = time.time()
# Train model: full-batch gradient steps over the whole graph.
for epoch in range(FLAGS.epochs):
    t = time.time()
    # Construct feed dictionary
    feed_dict = construct_feed_dict(features, support, y_train, train_mask, placeholders)
    feed_dict.update({placeholders['dropout']: FLAGS.dropout})

    # Training step
    outs = sess.run([model.opt_op, model.loss, model.accuracy], feed_dict=feed_dict)

    # Validation
    cost, acc, duration = evaluate(features, support, y_val, val_mask, placeholders)
    cost_val.append(cost)

    # Print results
    print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(outs[1]),
          "train_acc=", "{:.5f}".format(outs[2]), "val_loss=", "{:.5f}".format(cost),
          "val_acc=", "{:.5f}".format(acc), "time=", "{:.5f}".format(time.time() - t))

    # if epoch % 5 == 0:
    #     # Validation
    #     test_cost, test_acc, test_duration = evaluate(features, support, y_test, test_mask, placeholders)
    #     print("training time by far=", "{:.5f}".format(time.time() - t_start),
    #           "epoch = {}".format(epoch + 1),
    #           "cost=", "{:.5f}".format(test_cost),
    #           "accuracy=", "{:.5f}".format(test_acc))

    if epoch > FLAGS.early_stopping and cost_val[-1] > np.mean(cost_val[-(FLAGS.early_stopping+1):-1]):
        print("Early stopping...")
        break

# print("Optimization Finished!")
train_duration = time.time()-t_start
# Testing
test_cost, test_acc, test_duration = evaluate(features, support, y_test, test_mask, placeholders)
print("Original test set results:", "cost=", "{:.5f}".format(test_cost), "accuracy=",
"{:.5f}".format(test_acc), "training time =", "{:.5f}".format(train_duration),
      "training time per epoch=", "{:.5f}".format(train_duration/(epoch+1)),
      "test time=", "{:.5f}".format(test_duration))

================================================
FILE: train_batch_multiRank_inductive_newscheme.py
================================================
# Experimental FastGCN variant ("new scheme"): fixed global sampling
# distributions p0/p1 computed once before training.
from __future__ import division
from __future__ import print_function

import time
import tensorflow as tf
import scipy.sparse as sp

from utils import *
from models import GCN, MLP, GCN_APPRO

# Set random seed
seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)

# Settings
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_string('dataset', 'pubmed', 'Dataset string.')  # 'cora', 'citeseer', 'pubmed'
flags.DEFINE_string('model', 'gcn_appr', 'Model string.')  # 'gcn', 'gcn_appr'
flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
flags.DEFINE_integer('epochs', 300, 'Number of epochs to train.')
flags.DEFINE_integer('hidden1', 16, 'Number of units in hidden layer 1.')
flags.DEFINE_float('dropout', 0.5, 'Dropout rate (1 - keep probability).')
flags.DEFINE_float('weight_decay', 5e-4, 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_integer('early_stopping', 30, 'Tolerance for early stopping (# of epochs).')
flags.DEFINE_integer('max_degree', 3, 'Maximum Chebyshev polynomial degree.')

# Module-level defaults; __main__ below overrides them via main(k, k).
rank1 = 300
rank0 = 300


# Load data
def iterate_minibatches_listinputs(inputs, batchsize, shuffle=False):
    """Yield aligned minibatch slices from a list of equally-sized inputs.

    Only full batches are produced; a trailing partial batch is dropped.
    NOTE(review): the loop variable `input` shadows the Python builtin.
    """
    assert inputs is not None
    numSamples = inputs[0].shape[0]
    if shuffle:
        indices = np.arange(numSamples)
        np.random.shuffle(indices)
    for start_idx in range(0, numSamples - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        yield [input[excerpt] for input in inputs]


def main(rank1, rank0):
    """Train/evaluate GCN_APPRO with fixed global sampling distributions.

    rank1/rank0: sample sizes for the top and bottom layers.
    """
    adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(FLAGS.dataset)
    train_index = np.where(train_mask)[0]
    adj_train = adj[train_index, :][:, train_index]
    train_mask = train_mask[train_index]
    y_train = y_train[train_index]
    val_index = np.where(val_mask)[0]
    # adj_val = adj[val_index, :][:, val_index]
    # val_mask = val_mask[val_index]
    # y_val = y_val[val_index]
    # test_index = np.where(test_mask)[0]
    # adj_test = adj[test_index, :][:, test_index]
    # test_mask = test_mask[test_index]
    # y_test = y_test[test_index]
    numNode_train = adj_train.shape[0]

    # print("numNode", numNode)

    # Some preprocessing
    features = nontuple_preprocess_features(features).todense()
    train_features = features[train_index]

    if FLAGS.model == 'gcn_appr':
        normADJ_train = nontuple_preprocess_adj(adj_train)
        normADJ = nontuple_preprocess_adj(adj)
        # normADJ_val = nontuple_preprocess_adj(adj_val)
        # normADJ_test = nontuple_preprocess_adj(adj_test)
        num_supports = 2
        model_func = GCN_APPRO
    else:
        raise ValueError('Invalid argument for model: ' + str(FLAGS.model))

    # Define placeholders
    placeholders = {
        'support': [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
        'features': tf.placeholder(tf.float32, shape=(None, features.shape[1])),
        'labels': tf.placeholder(tf.float32, shape=(None, y_train.shape[1])),
        'labels_mask': tf.placeholder(tf.int32),
        'dropout': tf.placeholder_with_default(0., shape=()),
        'num_features_nonzero': tf.placeholder(tf.int32)  # helper variable for sparse dropout
    }

    # Create model
    model = model_func(placeholders, input_dim=features.shape[-1], logging=True)

    # Initialize session
    sess = tf.Session()

    # Define model evaluation function
    def evaluate(features, support, labels, mask, placeholders):
        """Run masked loss/accuracy on the given inputs and time it."""
        t_test = time.time()
        feed_dict_val = construct_feed_dict(features, support, labels, mask, placeholders)
        outs_val = sess.run([model.loss, model.accuracy], feed_dict=feed_dict_val)
        return outs_val[0], outs_val[1], (time.time() - t_test)

    # Init variables
    sess.run(tf.global_variables_initializer())

    cost_val = []

    # Global (fixed) distributions: p0 from adjacency columns, p1 mixing in
    # the features (see utils.mix_prop).
    p0 = column_prop(normADJ_train)
    p1 = mix_prop(normADJ_train, features[train_index, :])

    testSupport = [sparse_to_tuple(normADJ), sparse_to_tuple(normADJ)]
    # valSupport = [sparse_to_tuple(normADJ_val), sparse_to_tuple(normADJ_val)]
    # testSupport = [sparse_to_tuple(normADJ_test), sparse_to_tuple(normADJ_test)]

    t = time.time()
    # Train model
    for epoch in range(FLAGS.epochs):
        n = 0
        for batch in iterate_minibatches_listinputs([normADJ_train, y_train, train_mask], batchsize=50, shuffle=True):
            [normADJ_batch, y_train_batch, train_mask_batch] = batch
            if sum(train_mask_batch) < 1:
                continue
            # p1 = column_prop(normADJ_batch)
            q1 = np.random.choice(np.arange(numNode_train), rank1, p=p0)  # top layer
            q0 = np.random.choice(np.arange(numNode_train), rank0, p=p0)  # bottom layer
            # NOTE(review): samples are drawn with p=p0 but rescaled below with
            # p1[...] — looks inconsistent for an unbiased estimator; verify
            # against the intended "new scheme" derivation.
            support1 = sparse_to_tuple(normADJ_batch[:, q1].dot(sp.diags(1.0 / (p1[q1] * rank1))))
            support0 = sparse_to_tuple(normADJ_train[q1, :][:, q0])
            # support1 = sparse_to_tuple(normADJ_batch)
            # support0 = sparse_to_tuple(normADJ[:, q0])

            features_inputs = sp.diags(1.0 / (p1[q0] * rank0)).dot(train_features[q0, :])  # selected nodes for approximation

            # Construct feed dictionary
            feed_dict = construct_feed_dict(features_inputs, [support0, support1], y_train_batch, train_mask_batch,
                                            placeholders)
            feed_dict.update({placeholders['dropout']: FLAGS.dropout})

            # Training step
            outs = sess.run([model.opt_op, model.loss, model.accuracy], feed_dict=feed_dict)

        # Validation (full graph, full label set — masks select val nodes).
        cost, acc, duration = evaluate(features, testSupport, y_val, val_mask, placeholders)
        cost_val.append(cost)

        # # Print results
        # print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(outs[1]),
        #       "train_acc=", "{:.5f}".format(outs[2]), "val_loss=", "{:.5f}".format(cost),
        #       "val_acc=", "{:.5f}".format(acc), "time=", "{:.5f}".format(time.time() - t))

        if epoch > FLAGS.early_stopping and cost_val[-1] > np.mean(cost_val[-(FLAGS.early_stopping + 1):-1]):
            # print("Early stopping...")
            break

    train_duration = time.time() - t
    # Testing
    test_cost, test_acc, test_duration = evaluate(features, testSupport, y_test, test_mask, placeholders)
    print("rank1 = {}".format(rank1), "rank0 = {}".format(rank0), "cost=", "{:.5f}".format(test_cost),
          "accuracy=", "{:.5f}".format(test_acc), "training time per epoch=", "{:.5f}".format(train_duration/epoch))


if __name__=="__main__":
    print("DATASET:", FLAGS.dataset)
    for k in range(100, 1000, 200):
        main(k, k)

================================================
FILE: train_batch_multiRank_inductive_reddit_Mixlayers_sampleA.py
================================================
# FastGCN on Reddit: mixed-layers model with importance sampling ("sampleA"),
# the paper's final inductive Reddit configuration.
from __future__ import division
from __future__ import print_function

import time
import tensorflow as tf
import scipy.sparse as sp

from utils import *
from models import GCN_APPRO_Mix
import json
from networkx.readwrite import json_graph
import os

# Set random seed
seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)

# Settings
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_string('dataset', 'pubmed', 'Dataset string.')  # 'cora', 'citeseer', 'pubmed'
flags.DEFINE_string('model', 'gcn_mix', 'Model string.')  # 'gcn', 'gcn_appr'
flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
flags.DEFINE_integer('epochs', 200, 'Number of epochs to train.')
flags.DEFINE_integer('hidden1', 128, 'Number of units in hidden layer 1.')
flags.DEFINE_float('dropout', 0.0, 'Dropout rate (1 - keep probability).')
flags.DEFINE_float('weight_decay', 1e-4, 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_integer('early_stopping', 30, 'Tolerance for early stopping (# of epochs).')
flags.DEFINE_integer('max_degree', 3, 'Maximum Chebyshev polynomial degree.')


# Load data
def iterate_minibatches_listinputs(inputs, batchsize, shuffle=False):
    """Yield aligned minibatch slices from a list of equally-sized inputs.

    Only full batches are produced; a trailing partial batch is dropped.
    NOTE(review): the loop variable `input` shadows the Python builtin.
    """
    assert inputs is not None
    numSamples = inputs[0].shape[0]
    if shuffle:
        indices = np.arange(numSamples)
        np.random.shuffle(indices)
    for start_idx in range(0, numSamples - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        yield [input[excerpt] for input in inputs]


def
loadRedditFromG(dataset_dir, inputfile): f= open(dataset_dir+inputfile) objects = [] for _ in range(pkl.load(f)): objects.append(pkl.load(f)) adj, train_labels, val_labels, test_labels, train_index, val_index, test_index = tuple(objects) feats = np.load(dataset_dir + "/reddit-feats.npy") return sp.csr_matrix(adj), sp.lil_matrix(feats), train_labels, val_labels, test_labels, train_index, val_index, test_index def loadRedditFromNPZ(dataset_dir): adj = sp.load_npz(dataset_dir+"reddit_adj.npz") data = np.load(dataset_dir+"reddit.npz") return adj, data['feats'], data['y_train'], data['y_val'], data['y_test'], data['train_index'], data['val_index'], data['test_index'] def transferRedditDataFormat(dataset_dir, output_file): G = json_graph.node_link_graph(json.load(open(dataset_dir + "/reddit-G.json"))) labels = json.load(open(dataset_dir + "/reddit-class_map.json")) train_ids = [n for n in G.nodes() if not G.node[n]['val'] and not G.node[n]['test']] test_ids = [n for n in G.nodes() if G.node[n]['test']] val_ids = [n for n in G.nodes() if G.node[n]['val']] train_labels = [labels[i] for i in train_ids] test_labels = [labels[i] for i in test_ids] val_labels = [labels[i] for i in val_ids] feats = np.load(dataset_dir + "/reddit-feats.npy") ## Logistic gets thrown off by big counts, so log transform num comments and score feats[:, 0] = np.log(feats[:, 0] + 1.0) feats[:, 1] = np.log(feats[:, 1] - min(np.min(feats[:, 1]), -1)) feat_id_map = json.load(open(dataset_dir + "reddit-id_map.json")) feat_id_map = {id: val for id, val in feat_id_map.iteritems()} # train_feats = feats[[feat_id_map[id] for id in train_ids]] # test_feats = feats[[feat_id_map[id] for id in test_ids]] # numNode = len(feat_id_map) # adj = sp.lil_matrix(np.zeros((numNode,numNode))) # for edge in G.edges(): # adj[feat_id_map[edge[0]], feat_id_map[edge[1]]] = 1 train_index = [feat_id_map[id] for id in train_ids] val_index = [feat_id_map[id] for id in val_ids] test_index = [feat_id_map[id] for id in test_ids] 
np.savez(output_file, feats = feats, y_train=train_labels, y_val=val_labels, y_test = test_labels, train_index = train_index, val_index=val_index, test_index = test_index) def transferLabel2Onehot(labels, N): y = np.zeros((len(labels),N)) for i in range(len(labels)): pos = labels[i] y[i,pos] =1 return y def construct_feeddict_forMixlayers(AXfeatures, support, labels, placeholders): feed_dict = dict() feed_dict.update({placeholders['labels']: labels}) feed_dict.update({placeholders['AXfeatures']: AXfeatures}) feed_dict.update({placeholders['support']: support}) feed_dict.update({placeholders['num_features_nonzero']: AXfeatures[1].shape}) return feed_dict def main(rank1): # config = tf.ConfigProto(device_count={"CPU": 4}, # limit to num_cpu_core CPU usage # inter_op_parallelism_threads = 1, # intra_op_parallelism_threads = 4, # log_device_placement=False) adj, features, y_train, y_val, y_test,train_index, val_index, test_index = loadRedditFromNPZ("data/") adj = adj+adj.T y_train = transferLabel2Onehot(y_train, 41) y_val = transferLabel2Onehot(y_val, 41) y_test = transferLabel2Onehot(y_test, 41) features = sp.lil_matrix(features) adj_train = adj[train_index, :][:, train_index] numNode_train = adj_train.shape[0] # print("numNode", numNode) if FLAGS.model == 'gcn_mix': normADJ_train = nontuple_preprocess_adj(adj_train) normADJ = nontuple_preprocess_adj(adj) # normADJ_val = nontuple_preprocess_adj(adj_val) # normADJ_test = nontuple_preprocess_adj(adj_test) num_supports = 2 model_func = GCN_APPRO_Mix else: raise ValueError('Invalid argument for model: ' + str(FLAGS.model)) # Some preprocessing features = nontuple_preprocess_features(features).todense() train_features = normADJ_train.dot(features[train_index]) features = normADJ.dot(features) nonzero_feature_number = len(np.nonzero(features)[0]) nonzero_feature_number_train = len(np.nonzero(train_features)[0]) # Define placeholders placeholders = { 'support': tf.sparse_placeholder(tf.float32) , 'AXfeatures': 
def main(rank1):
    """Train and evaluate the Mix-layers FastGCN model on Reddit with
    importance sampling driven by a fixed column distribution p0.

    rank1: number of columns (neighbor nodes) sampled for the top layer per
        minibatch; None disables sampling and feeds the full support.
    """
    # config = tf.ConfigProto(device_count={"CPU": 4},  # limit to num_cpu_core CPU usage
    #                         inter_op_parallelism_threads=1,
    #                         intra_op_parallelism_threads=4,
    #                         log_device_placement=False)
    adj, features, y_train, y_val, y_test, train_index, val_index, test_index = loadRedditFromNPZ("data/")
    adj = adj+adj.T  # symmetrize the adjacency
    # Reddit labels arrive as int ids over 41 classes.
    y_train = transferLabel2Onehot(y_train, 41)
    y_val = transferLabel2Onehot(y_val, 41)
    y_test = transferLabel2Onehot(y_test, 41)
    features = sp.lil_matrix(features)

    # Inductive setting: training graph restricted to training nodes only.
    adj_train = adj[train_index, :][:, train_index]
    numNode_train = adj_train.shape[0]

    if FLAGS.model == 'gcn_mix':
        normADJ_train = nontuple_preprocess_adj(adj_train)
        normADJ = nontuple_preprocess_adj(adj)
        num_supports = 2
        model_func = GCN_APPRO_Mix
    else:
        raise ValueError('Invalid argument for model: ' + str(FLAGS.model))

    # Precompute A*X once so the bottom layer never needs sampling.
    features = nontuple_preprocess_features(features).todense()
    train_features = normADJ_train.dot(features[train_index])
    features = normADJ.dot(features)
    nonzero_feature_number = len(np.nonzero(features)[0])
    nonzero_feature_number_train = len(np.nonzero(train_features)[0])

    placeholders = {
        'support': tf.sparse_placeholder(tf.float32),
        'AXfeatures': tf.placeholder(tf.float32, shape=(None, features.shape[1])),
        'labels': tf.placeholder(tf.float32, shape=(None, y_train.shape[1])),
        'dropout': tf.placeholder_with_default(0., shape=()),
        'num_features_nonzero': tf.placeholder(tf.int32)  # helper variable for sparse dropout
    }

    model = model_func(placeholders, input_dim=features.shape[-1], logging=True)
    sess = tf.Session()

    def evaluate(features, support, labels, placeholders):
        # Returns (loss, accuracy, wall time) for one full forward pass.
        t_test = time.time()
        feed_dict_val = construct_feeddict_forMixlayers(features, support, labels, placeholders)
        outs_val = sess.run([model.loss, model.accuracy], feed_dict=feed_dict_val)
        return outs_val[0], outs_val[1], (time.time() - t_test)

    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()

    cost_val = []
    p0 = column_prop(normADJ_train)  # fixed importance distribution over train columns

    valSupport = sparse_to_tuple(normADJ[val_index, :])
    testSupport = sparse_to_tuple(normADJ[test_index, :])

    t = time.time()
    maxACC = 0.0
    for epoch in range(FLAGS.epochs):
        t1 = time.time()
        n = 0
        for batch in iterate_minibatches_listinputs([normADJ_train, y_train], batchsize=256, shuffle=True):
            [normADJ_batch, y_train_batch] = batch
            # p1 = column_prop(normADJ_batch)
            if rank1 is None:
                support1 = sparse_to_tuple(normADJ_batch)
                features_inputs = train_features
            else:
                # Columns that actually carry mass in this minibatch.
                distr = np.nonzero(np.sum(normADJ_batch, axis=0))[1]
                if rank1 > len(distr):
                    q1 = distr
                else:
                    # Importance-sample rank1 columns with the global p0
                    # renormalized over this batch's nonzero columns.
                    q1 = np.random.choice(distr, rank1, replace=False, p=p0[distr]/sum(p0[distr]))  # top layer
                # q1 = np.random.choice(np.arange(numNode_train), rank1, p=p0)  # top layer
                # Rescale sampled columns by 1/(p*rank1) for an unbiased estimator.
                support1 = sparse_to_tuple(normADJ_batch[:, q1].dot(sp.diags(1.0 / (p0[q1] * rank1))))
                if len(support1[1]) == 0:
                    continue  # all sampled columns empty for this batch
                features_inputs = train_features[q1, :]  # selected nodes for approximation
            feed_dict = construct_feeddict_forMixlayers(features_inputs, support1, y_train_batch, placeholders)
            feed_dict.update({placeholders['dropout']: FLAGS.dropout})
            outs = sess.run([model.opt_op, model.loss, model.accuracy], feed_dict=feed_dict)
            n = n+1
        # Validation once per epoch on the full (unsampled) support.
        cost, acc, duration = evaluate(features, valSupport, y_val, placeholders)
        cost_val.append(cost)
        # Checkpoint the best validation accuracy after a 20-epoch warm-up.
        if epoch > 20 and acc > maxACC:
            maxACC = acc
            saver.save(sess, "tmp/tmp_MixModel_sampleA_full.ckpt")
        print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(outs[1]),
              "train_acc=", "{:.5f}".format(outs[2]), "val_loss=", "{:.5f}".format(cost),
              "val_acc=", "{:.5f}".format(acc),
              "time per batch=", "{:.5f}".format((time.time() - t1)/n))
        if epoch % 5 == 0:
            # Periodic test-set progress report.
            test_cost, test_acc, test_duration = evaluate(features, testSupport, y_test, placeholders)
            print("training time by far=", "{:.5f}".format(time.time() - t),
                  "epoch = {}".format(epoch + 1),
                  "cost=", "{:.5f}".format(test_cost),
                  "accuracy=", "{:.5f}".format(test_acc))
        if epoch > FLAGS.early_stopping and np.mean(cost_val[-2:]) > np.mean(cost_val[-(FLAGS.early_stopping + 1):-1]):
            # print("Early stopping...")
            break
    train_duration = time.time() - t

    # Restore the best checkpoint if one was saved; V2 checkpoints are
    # detected via their ".index" companion file.
    if os.path.exists("tmp/tmp_MixModel_sampleA_full.ckpt.index"):
        saver.restore(sess, "tmp/tmp_MixModel_sampleA_full.ckpt")
    test_cost, test_acc, test_duration = evaluate(features, testSupport, y_test, placeholders)
    print("rank1 = {}".format(rank1), "cost=", "{:.5f}".format(test_cost),
          "accuracy=", "{:.5f}".format(test_acc),
          "training time=", "{:.5f}".format(train_duration),
          "epoch = {}".format(epoch+1),
          "test time=", "{:.5f}".format(test_duration))
def test(rank1=None):
    """Evaluate a previously trained Mix-layers model on the Reddit test split.

    rank1: number of columns to importance-sample for the single support;
        None evaluates with the full, unsampled test support.
    """
    # config = tf.ConfigProto(device_count={"CPU": 4},  # limit to num_cpu_core CPU usage
    #                         inter_op_parallelism_threads=1,
    #                         intra_op_parallelism_threads=4,
    #                         log_device_placement=False)
    adj, features, y_train, y_val, y_test, train_index, val_index, test_index = loadRedditFromNPZ("data/")
    adj = adj + adj.T
    y_train = transferLabel2Onehot(y_train, 41)
    y_test = transferLabel2Onehot(y_test, 41)
    features = sp.lil_matrix(features)

    numNode_train = y_train.shape[0]

    if FLAGS.model == 'gcn_mix':
        normADJ = nontuple_preprocess_adj(adj)
        normADJ_test = normADJ[test_index, :]
        num_supports = 2
        model_func = GCN_APPRO_Mix
    else:
        raise ValueError('Invalid argument for model: ' + str(FLAGS.model))

    # Precompute A*X for the bottom layer.
    features = nontuple_preprocess_features(features).todense()
    features = normADJ.dot(features)

    placeholders = {
        'support': tf.sparse_placeholder(tf.float32),
        'AXfeatures': tf.placeholder(tf.float32, shape=(None, features.shape[1])),
        'labels': tf.placeholder(tf.float32, shape=(None, y_train.shape[1])),
        'dropout': tf.placeholder_with_default(0., shape=()),
        'num_features_nonzero': tf.placeholder(tf.int32)  # helper variable for sparse dropout
    }

    model = model_func(placeholders, input_dim=features.shape[-1], logging=True)
    sess = tf.Session()

    def evaluate(features, support, labels, placeholders):
        # Returns (loss, accuracy, wall time) for one forward pass.
        t_test = time.time()
        feed_dict_val = construct_feeddict_forMixlayers(features, support, labels, placeholders)
        outs_val = sess.run([model.loss, model.accuracy], feed_dict=feed_dict_val)
        return outs_val[0], outs_val[1], (time.time() - t_test)

    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    # NOTE(review): this restores "tmp_MixModel_sampleA.ckpt", but main() in
    # this same file saves "tmp_MixModel_sampleA_full.ckpt" — confirm the
    # intended checkpoint prefix before relying on this entry point.
    saver.restore(sess, "tmp/tmp_MixModel_sampleA.ckpt")

    cost_val = []
    p0 = column_prop(normADJ_test)
    t = time.time()
    if rank1 is None:
        support1 = sparse_to_tuple(normADJ_test)
        features_inputs = features
    else:
        # Sample only among columns that are nonzero in the test support.
        distr = np.nonzero(np.sum(normADJ_test, axis=0))[1]
        if rank1 > len(distr):
            q1 = distr
        else:
            q1 = np.random.choice(distr, rank1, replace=False, p=p0[distr] / sum(p0[distr]))  # top layer
        # q1 = np.random.choice(np.arange(numNode_train), rank1, p=p0)  # top layer
        # Rescale by 1/(p*rank1) to keep the estimator unbiased.
        support1 = sparse_to_tuple(normADJ_test[:, q1].dot(sp.diags(1.0 / (p0[q1] * rank1))))
        features_inputs = features[q1, :]  # selected nodes for approximation
    test_cost, test_acc, test_duration = evaluate(features_inputs, support1, y_test, placeholders)
    test_duration = time.time() - t
    print("rank1 = {}".format(rank1), "cost=", "{:.5f}".format(test_cost),
          "accuracy=", "{:.5f}".format(test_acc),
          "test time=", "{:.5f}".format(test_duration))
def iterate_minibatches_listinputs(inputs, batchsize, shuffle=False):
    """Yield aligned fixed-size minibatches from several equal-length arrays.

    Args:
        inputs: list of row-indexable arrays/matrices; all are assumed to
            share the same first-dimension length (only inputs[0] is checked).
        batchsize: number of rows per yielded batch.
        shuffle: if True, iterate in one random permutation of the rows.

    Yields:
        Lists with one excerpt per input, each of exactly `batchsize` rows.
        NOTE: the trailing remainder (numSamples % batchsize rows) is
        dropped, matching the original behavior callers rely on.
    """
    assert inputs is not None
    numSamples = inputs[0].shape[0]
    if shuffle:
        indices = np.arange(numSamples)
        np.random.shuffle(indices)
    for start_idx in range(0, numSamples - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        # `arr`, not `input`: the original name shadowed the builtin.
        yield [arr[excerpt] for arr in inputs]


def loadRedditFromG(dataset_dir, inputfile):
    """Load the pickled Reddit split objects plus the .npy feature matrix.

    The pickle stream starts with an object count followed by that many
    objects: adj, the three label arrays, and the three index arrays.
    """
    # `with` closes the handle even on error (the original leaked it).
    with open(dataset_dir + inputfile) as f:
        objects = [pkl.load(f) for _ in range(pkl.load(f))]
    adj, train_labels, val_labels, test_labels, train_index, val_index, test_index = tuple(objects)
    feats = np.load(dataset_dir + "/reddit-feats.npy")
    return sp.csr_matrix(adj), sp.lil_matrix(feats), train_labels, val_labels, test_labels, train_index, val_index, test_index


def loadRedditFromNPZ(dataset_dir):
    """Load the preprocessed Reddit adjacency (.npz sparse) and data bundle."""
    adj = sp.load_npz(dataset_dir + "reddit_adj.npz")
    data = np.load(dataset_dir + "reddit.npz")
    return adj, data['feats'], data['y_train'], data['y_val'], data['y_test'], data['train_index'], data['val_index'], data['test_index']
def main(rank1):
    """Train the Mix-layers FastGCN on Reddit with per-batch importance
    sampling (column probabilities recomputed from each minibatch).

    rank1: number of columns sampled per minibatch for the top layer;
        None disables sampling and feeds the full support.
    """
    adj, features, y_train, y_val, y_test, train_index, val_index, test_index = loadRedditFromNPZ("data/")
    adj = adj+adj.T  # symmetrize the adjacency
    y_train = transferLabel2Onehot(y_train, 41)
    y_val = transferLabel2Onehot(y_val, 41)
    y_test = transferLabel2Onehot(y_test, 41)
    features = sp.lil_matrix(features)

    # Inductive setting: training graph restricted to training nodes.
    adj_train = adj[train_index, :][:, train_index]
    numNode_train = adj_train.shape[0]

    if FLAGS.model == 'gcn_mix':
        normADJ_train = nontuple_preprocess_adj(adj_train)
        normADJ = nontuple_preprocess_adj(adj)
        num_supports = 2
        model_func = GCN_APPRO_Mix
    else:
        raise ValueError('Invalid argument for model: ' + str(FLAGS.model))

    # Precompute A*X once so the bottom layer never needs sampling.
    features = nontuple_preprocess_features(features).todense()
    train_features = normADJ_train.dot(features[train_index])
    features = normADJ.dot(features)

    placeholders = {
        'support': tf.sparse_placeholder(tf.float32),
        'AXfeatures': tf.placeholder(tf.float32, shape=(None, features.shape[1])),
        'labels': tf.placeholder(tf.float32, shape=(None, y_train.shape[1])),
        'dropout': tf.placeholder_with_default(0., shape=()),
        'num_features_nonzero': tf.placeholder(tf.int32)  # helper variable for sparse dropout
    }

    model = model_func(placeholders, input_dim=features.shape[-1], logging=True)
    sess = tf.Session()

    def evaluate(features, support, labels, placeholders):
        # Returns (loss, accuracy, wall time) for one full forward pass.
        t_test = time.time()
        feed_dict_val = construct_feeddict_forMixlayers(features, support, labels, placeholders)
        outs_val = sess.run([model.loss, model.accuracy], feed_dict=feed_dict_val)
        return outs_val[0], outs_val[1], (time.time() - t_test)

    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()

    cost_val = []
    # (Removed an unused `p0 = column_prop(normADJ_train)`: this variant
    # recomputes the distribution per batch as p1 and never read p0.)
    valSupport = sparse_to_tuple(normADJ[val_index, :])
    testSupport = sparse_to_tuple(normADJ[test_index, :])

    t = time.time()
    maxACC = 0.0
    for epoch in range(FLAGS.epochs):
        t1 = time.time()
        n = 0
        for batch in iterate_minibatches_listinputs([normADJ_train, y_train], batchsize=256, shuffle=True):
            [normADJ_batch, y_train_batch] = batch
            # Per-batch importance distribution over columns.
            p1 = column_prop(normADJ_batch)
            if rank1 is None:
                support1 = sparse_to_tuple(normADJ_batch)
                features_inputs = train_features
            else:
                q1 = np.random.choice(np.arange(numNode_train), rank1, replace=False, p=p1)  # top layer
                # Rescale sampled columns by 1/(p*rank1) for unbiasedness.
                support1 = sparse_to_tuple(normADJ_batch[:, q1].dot(sp.diags(1.0 / (p1[q1] * rank1))))
                features_inputs = train_features[q1, :]  # selected nodes for approximation
            feed_dict = construct_feeddict_forMixlayers(features_inputs, support1, y_train_batch, placeholders)
            feed_dict.update({placeholders['dropout']: FLAGS.dropout})
            outs = sess.run([model.opt_op, model.loss, model.accuracy], feed_dict=feed_dict)
            n = n + 1
        cost, acc, duration = evaluate(features, valSupport, y_val, placeholders)
        cost_val.append(cost)
        # Checkpoint the best validation accuracy after a 50-epoch warm-up.
        if epoch > 50 and acc > maxACC:
            maxACC = acc
            save_path = saver.save(sess, "tmp/tmp_MixModel.ckpt")
        print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(outs[1]),
              "train_acc=", "{:.5f}".format(outs[2]), "val_loss=", "{:.5f}".format(cost),
              "val_acc=", "{:.5f}".format(acc),
              "time per batch=", "{:.5f}".format((time.time() - t1)/n))
        if epoch > FLAGS.early_stopping and np.mean(cost_val[-2:]) > np.mean(cost_val[-(FLAGS.early_stopping + 1):-1]):
            # print("Early stopping...")
            break
    train_duration = time.time() - t

    # BUG FIX: tf.train.Saver (V2 format) writes "<prefix>.index"/".data-*",
    # never a file named exactly "tmp/tmp_MixModel.ckpt", so the original
    # exists-check was always False and the best checkpoint was never
    # restored. Check the ".index" companion instead (as the sampleA
    # variant of this script already does).
    if os.path.exists("tmp/tmp_MixModel.ckpt.index"):
        saver.restore(sess, "tmp/tmp_MixModel.ckpt")
    test_cost, test_acc, test_duration = evaluate(features, testSupport, y_test, placeholders)
    print("rank1 = {}".format(rank1), "cost=", "{:.5f}".format(test_cost),
          "accuracy=", "{:.5f}".format(test_acc),
          "training time=", "{:.5f}".format(train_duration),
          "epoch = {}".format(epoch+1),
          "test time=", "{:.5f}".format(test_duration))
import print_function import time import tensorflow as tf import scipy.sparse as sp from utils import * from models import GCN, MLP, GCN_APPRO_Mix import json from networkx.readwrite import json_graph import os # Set random seed seed = 123 np.random.seed(seed) tf.set_random_seed(seed) # Settings flags = tf.app.flags FLAGS = flags.FLAGS flags.DEFINE_string('dataset', 'pubmed', 'Dataset string.') # 'cora', 'citeseer', 'pubmed' flags.DEFINE_string('model', 'gcn_mix', 'Model string.') # 'gcn', 'gcn_appr' flags.DEFINE_float('learning_rate', 0.001, 'Initial learning rate.') flags.DEFINE_integer('epochs', 200, 'Number of epochs to train.') flags.DEFINE_integer('hidden1', 128, 'Number of units in hidden layer 1.') flags.DEFINE_float('dropout', 0.0, 'Dropout rate (1 - keep probability).') flags.DEFINE_float('weight_decay', 1e-4, 'Weight for L2 loss on embedding matrix.') flags.DEFINE_integer('early_stopping', 100, 'Tolerance for early stopping (# of epochs).') flags.DEFINE_integer('max_degree', 3, 'Maximum Chebyshev polynomial degree.') # Load data def iterate_minibatches_listinputs(inputs, batchsize, shuffle=False): assert inputs is not None numSamples = inputs[0].shape[0] if shuffle: indices = np.arange(numSamples) np.random.shuffle(indices) for start_idx in range(0, numSamples - batchsize + 1, batchsize): if shuffle: excerpt = indices[start_idx:start_idx + batchsize] else: excerpt = slice(start_idx, start_idx + batchsize) yield [input[excerpt] for input in inputs] def loadRedditFromG(dataset_dir, inputfile): f= open(dataset_dir+inputfile) objects = [] for _ in range(pkl.load(f)): objects.append(pkl.load(f)) adj, train_labels, val_labels, test_labels, train_index, val_index, test_index = tuple(objects) feats = np.load(dataset_dir + "/reddit-feats.npy") return sp.csr_matrix(adj), sp.lil_matrix(feats), train_labels, val_labels, test_labels, train_index, val_index, test_index def loadRedditFromNPZ(dataset_dir): adj = sp.load_npz(dataset_dir+"reddit_adj.npz") data = 
def transferRedditDataFormat(dataset_dir, output_file):
    """Convert the raw GraphSAGE Reddit release (JSON graph + .npy features)
    into the single .npz bundle consumed by loadRedditFromNPZ.

    Args:
        dataset_dir: directory holding reddit-G.json, reddit-class_map.json,
            reddit-id_map.json and reddit-feats.npy.
        output_file: path handed to np.savez for the output bundle.
    """
    G = json_graph.node_link_graph(json.load(open(dataset_dir + "/reddit-G.json")))
    labels = json.load(open(dataset_dir + "/reddit-class_map.json"))

    # Split membership is stored per node as 'val'/'test' boolean attributes.
    train_ids = [n for n in G.nodes() if not G.node[n]['val'] and not G.node[n]['test']]
    test_ids = [n for n in G.nodes() if G.node[n]['test']]
    val_ids = [n for n in G.nodes() if G.node[n]['val']]
    train_labels = [labels[i] for i in train_ids]
    test_labels = [labels[i] for i in test_ids]
    val_labels = [labels[i] for i in val_ids]

    feats = np.load(dataset_dir + "/reddit-feats.npy")
    ## Logistic gets thrown off by big counts, so log transform num comments and score
    feats[:, 0] = np.log(feats[:, 0] + 1.0)
    feats[:, 1] = np.log(feats[:, 1] - min(np.min(feats[:, 1]), -1))

    feat_id_map = json.load(open(dataset_dir + "reddit-id_map.json"))
    # FIX: dict.iteritems() is Python-2-only; items() behaves identically here
    # and also runs under Python 3. Loop vars renamed so the builtin `id` is
    # no longer shadowed.
    feat_id_map = {node: row for node, row in feat_id_map.items()}

    train_index = [feat_id_map[node] for node in train_ids]
    val_index = [feat_id_map[node] for node in val_ids]
    test_index = [feat_id_map[node] for node in test_ids]
    np.savez(output_file, feats=feats, y_train=train_labels, y_val=val_labels,
             y_test=test_labels, train_index=train_index, val_index=val_index,
             test_index=test_index)
def main(rank1):
    """Train the Mix-layers FastGCN on Reddit with UNIFORM column sampling
    (no importance weights) — the uniform-sampling baseline.

    rank1: number of columns sampled uniformly per minibatch for the top
        layer; None disables sampling and feeds the full support.
    """
    # config = tf.ConfigProto(device_count={"CPU": 4},  # limit to num_cpu_core CPU usage
    #                         inter_op_parallelism_threads=1,
    #                         intra_op_parallelism_threads=4,
    #                         log_device_placement=False)
    adj, features, y_train, y_val, y_test, train_index, val_index, test_index = loadRedditFromNPZ("data/")
    adj = adj+adj.T  # symmetrize the adjacency
    y_train = transferLabel2Onehot(y_train, 41)
    y_val = transferLabel2Onehot(y_val, 41)
    y_test = transferLabel2Onehot(y_test, 41)
    features = sp.lil_matrix(features)

    # Inductive setting: training graph restricted to training nodes.
    adj_train = adj[train_index, :][:, train_index]
    numNode_train = adj_train.shape[0]

    if FLAGS.model == 'gcn_mix':
        normADJ_train = nontuple_preprocess_adj(adj_train)
        normADJ = nontuple_preprocess_adj(adj)
        model_func = GCN_APPRO_Mix
    else:
        raise ValueError('Invalid argument for model: ' + str(FLAGS.model))

    # Precompute A*X once; the bottom layer is then a plain dense lookup.
    features = nontuple_preprocess_features(features).todense()
    train_features = normADJ_train.dot(features[train_index])
    features = normADJ.dot(features)
    nonzero_feature_number = len(np.nonzero(features)[0])
    nonzero_feature_number_train = len(np.nonzero(train_features)[0])

    placeholders = {
        'support': tf.sparse_placeholder(tf.float32),
        'AXfeatures': tf.placeholder(tf.float32, shape=(None, features.shape[1])),
        'labels': tf.placeholder(tf.float32, shape=(None, y_train.shape[1])),
        'dropout': tf.placeholder_with_default(0., shape=()),
        'num_features_nonzero': tf.placeholder(tf.int32)  # helper variable for sparse dropout
    }

    model = model_func(placeholders, input_dim=features.shape[-1], logging=True)
    sess = tf.Session()
    saver = tf.train.Saver()

    def evaluate(features, support, labels, placeholders):
        # Returns (loss, accuracy, wall time) for one full forward pass.
        t_test = time.time()
        feed_dict_val = construct_feeddict_forMixlayers(features, support, labels, placeholders)
        outs_val = sess.run([model.loss, model.accuracy], feed_dict=feed_dict_val)
        return outs_val[0], outs_val[1], (time.time() - t_test)

    sess.run(tf.global_variables_initializer())

    cost_val = []
    valSupport = sparse_to_tuple(normADJ[val_index, :])
    testSupport = sparse_to_tuple(normADJ[test_index, :])

    t = time.time()
    maxACC = 0.0
    for epoch in range(FLAGS.epochs):
        t1 = time.time()
        n = 0
        for batch in iterate_minibatches_listinputs([normADJ_train, y_train], batchsize=256, shuffle=True):
            [normADJ_batch, y_train_batch] = batch
            if rank1 is None:
                support1 = sparse_to_tuple(normADJ_batch)
                features_inputs = train_features
            else:
                # Uniformly sample among columns nonzero in this minibatch.
                distr = np.nonzero(np.sum(normADJ_batch, axis=0))[1]
                if rank1 > len(distr):
                    q1 = distr
                else:
                    q1 = np.random.choice(distr, rank1, replace=False)  # top layer
                # q1 = np.random.choice(np.arange(numNode_train), rank1)  # top layer
                # Uniform estimator: scale the kept columns by N/len(q1).
                support1 = sparse_to_tuple(normADJ_batch[:, q1]*numNode_train/len(q1))
                features_inputs = train_features[q1, :]  # selected nodes for approximation
            feed_dict = construct_feeddict_forMixlayers(features_inputs, support1, y_train_batch, placeholders)
            feed_dict.update({placeholders['dropout']: FLAGS.dropout})
            outs = sess.run([model.opt_op, model.loss, model.accuracy], feed_dict=feed_dict)
            n = n+1
        cost, acc, duration = evaluate(features, valSupport, y_val, placeholders)
        cost_val.append(cost)
        # Checkpoint the best validation accuracy after a 50-epoch warm-up.
        if epoch > 50 and acc > maxACC:
            maxACC = acc
            save_path = saver.save(sess, "tmp/tmp_MixModel_uniform.ckpt")
        print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(outs[1]),
              "train_acc=", "{:.5f}".format(outs[2]), "val_loss=", "{:.5f}".format(cost),
              "val_acc=", "{:.5f}".format(acc),
              "time per batch=", "{:.5f}".format((time.time() - t1)/n))
        if epoch > FLAGS.early_stopping and np.mean(cost_val[-2:]) > np.mean(cost_val[-(FLAGS.early_stopping + 1):-1]):
            # print("Early stopping...")
            break
    train_duration = time.time() - t

    # NOTE(review): V2 Saver checkpoints only create "<prefix>.index"/
    # ".data-*" files, so this exact-path check is likely always False and the
    # best checkpoint is never restored — confirm; checking for
    # "tmp/tmp_MixModel_uniform.ckpt.index" would match what Saver writes
    # (the sampleA variant of this script checks the ".index" file).
    if os.path.exists("tmp/tmp_MixModel_uniform.ckpt"):
        saver.restore(sess, "tmp/tmp_MixModel_uniform.ckpt")
    test_cost, test_acc, test_duration = evaluate(features, testSupport, y_test, placeholders)
    print("rank1 = {}".format(rank1), "cost=", "{:.5f}".format(test_cost),
          "accuracy=", "{:.5f}".format(test_acc),
          "training time=", "{:.5f}".format(train_duration),
          "epoch = {}".format(epoch + 1),
          "test time=", "{:.5f}".format(test_duration))
def iterate_minibatches_listinputs(inputs, batchsize, shuffle=False):
    """Walk several parallel arrays in lockstep, one fixed-size batch at a
    time; rows that do not fill a complete batch are skipped."""
    assert inputs is not None
    total = inputs[0].shape[0]
    if shuffle:
        order = np.arange(total)
        np.random.shuffle(order)
    stop = total - batchsize + 1
    for begin in range(0, stop, batchsize):
        end = begin + batchsize
        picks = order[begin:end] if shuffle else slice(begin, end)
        yield [chunk[picks] for chunk in inputs]


def loadRedditFromG(dataset_dir, inputfile):
    """Read the pickled Reddit objects (count-prefixed stream) plus the
    feature matrix; returns (csr adj, lil feats, labels..., indices...)."""
    handle = open(dataset_dir + inputfile)
    count = pkl.load(handle)
    loaded = [pkl.load(handle) for _ in range(count)]
    adj, train_labels, val_labels, test_labels, train_index, val_index, test_index = tuple(loaded)
    feats = np.load(dataset_dir + "/reddit-feats.npy")
    return (sp.csr_matrix(adj), sp.lil_matrix(feats), train_labels, val_labels,
            test_labels, train_index, val_index, test_index)


def loadRedditFromNPZ(dataset_dir):
    """Read the preprocessed Reddit adjacency and tensor bundle from disk."""
    adjacency = sp.load_npz(dataset_dir + "reddit_adj.npz")
    bundle = np.load(dataset_dir + "reddit.npz")
    return (adjacency, bundle['feats'], bundle['y_train'], bundle['y_val'],
            bundle['y_test'], bundle['train_index'], bundle['val_index'],
            bundle['test_index'])
def main(rank1, rank0):
    """Train the two-layer sampled GCN (GCN_APPRO) on Reddit.

    rank1: sample size for the top support layer.
    rank0: sample size for the bottom support layer.
    """
    # config = tf.ConfigProto(device_count={"CPU": 4},  # limit to num_cpu_core CPU usage
    #                         inter_op_parallelism_threads=1,
    #                         intra_op_parallelism_threads=4,
    #                         log_device_placement=False)
    adj, features, y_train, y_val, y_test, train_index, val_index, test_index = loadRedditFromNPZ("data/")
    adj = adj+adj.T  # symmetrize the adjacency
    y_train = transferLabel2Onehot(y_train, 41)
    y_val = transferLabel2Onehot(y_val, 41)
    y_test = transferLabel2Onehot(y_test, 41)
    features = sp.lil_matrix(features)

    adj_train = adj[train_index, :][:, train_index]
    numNode_train = adj_train.shape[0]
    # Every node in each split is labeled, so the masks are all ones.
    train_mask = np.ones((numNode_train,))
    val_mask = np.ones((y_val.shape[0],))
    test_mask = np.ones((y_test.shape[0],))

    # Some preprocessing: raw features only (no precomputed A*X here — this
    # variant samples both layers).
    features = nontuple_preprocess_features(features).todense()
    train_features = features[train_index]

    if FLAGS.model == 'gcn_appr':
        normADJ_train = nontuple_preprocess_adj(adj_train)
        normADJ = nontuple_preprocess_adj(adj)
        num_supports = 2
        model_func = GCN_APPRO
    else:
        raise ValueError('Invalid argument for model: ' + str(FLAGS.model))

    placeholders = {
        'support': [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
        'features': tf.placeholder(tf.float32, shape=(None, features.shape[1])),
        'labels': tf.placeholder(tf.float32, shape=(None, y_train.shape[1])),
        'labels_mask': tf.placeholder(tf.int32),
        'dropout': tf.placeholder_with_default(0., shape=()),
        'num_features_nonzero': tf.placeholder(tf.int32)  # helper variable for sparse dropout
    }

    model = model_func(placeholders, input_dim=features.shape[-1], logging=True)
    sess = tf.Session()

    def evaluate(features, support, labels, mask, placeholders):
        # Returns (loss, accuracy, wall time) for one full forward pass.
        t_test = time.time()
        feed_dict_val = construct_feed_dict(features, support, labels, mask, placeholders)
        outs_val = sess.run([model.loss, model.accuracy], feed_dict=feed_dict_val)
        return outs_val[0], outs_val[1], (time.time() - t_test)

    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()

    cost_val = []
    p0 = column_prop(normADJ_train)

    # Validation/test use the full two supports (no sampling).
    valSupport = [sparse_to_tuple(normADJ), sparse_to_tuple(normADJ[val_index, :])]
    testSupport = [sparse_to_tuple(normADJ), sparse_to_tuple(normADJ[test_index, :])]

    t = time.time()
    maxACC = 0.0
    for epoch in range(FLAGS.epochs):
        t1 = time.time()
        n = 0
        for batch in iterate_minibatches_listinputs([normADJ_train, y_train, train_mask], batchsize=256, shuffle=True):
            [normADJ_batch, y_train_batch, train_mask_batch] = batch
            if sum(train_mask_batch) < 1:
                continue  # nothing to learn from a fully-masked batch
            # Importance-sample the top layer from this batch's columns...
            p1 = column_prop(normADJ_batch)
            q1 = np.random.choice(np.arange(numNode_train), rank1, replace=False, p=p1)  # top layer
            support1 = sparse_to_tuple(normADJ_batch[:, q1].dot(sp.diags(1.0 / (p1[q1] * rank1))))
            # ...then the bottom layer conditioned on the rows picked above.
            p2 = column_prop(normADJ_train[q1, :])
            q0 = np.random.choice(np.arange(numNode_train), rank0, replace=False, p=p2)
            support0 = sparse_to_tuple(normADJ_train[q1, :][:, q0])
            # Rescale bottom-layer features by 1/(p*rank0) for unbiasedness.
            features_inputs = np.diag(1.0 / (p2[q0] * rank0)).dot(train_features[q0, :])  # selected nodes for approximation
            # (A commented-out alternative that sampled only nonzero columns
            # was removed here for readability.)
            feed_dict = construct_feed_dict(features_inputs, [support0, support1], y_train_batch, train_mask_batch, placeholders)
            feed_dict.update({placeholders['dropout']: FLAGS.dropout})
            outs = sess.run([model.opt_op, model.loss, model.accuracy], feed_dict=feed_dict)
            n = n+1
        cost, acc, duration = evaluate(features, valSupport, y_val, val_mask, placeholders)
        cost_val.append(cost)
        # Checkpoint the best validation accuracy after a 50-epoch warm-up.
        if epoch > 50 and acc > maxACC:
            maxACC = acc
            save_path = saver.save(sess, "tmp/tmp_redditModel.ckpt")
        print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(outs[1]),
              "train_acc=", "{:.5f}".format(outs[2]), "val_loss=", "{:.5f}".format(cost),
              "val_acc=", "{:.5f}".format(acc),
              "time per batch=", "{:.5f}".format((time.time() - t1)/n))
        if epoch > FLAGS.early_stopping and np.mean(cost_val[-2:]) > np.mean(cost_val[-(FLAGS.early_stopping + 1):-1]):
            # print("Early stopping...")
            break
    train_duration = time.time() - t

    # NOTE(review): tf.train.Saver (V2) writes "<prefix>.index"/".data-*", not
    # a file named exactly "tmp/tmp_redditModel.ckpt", so this check is likely
    # always False and the best checkpoint is never restored — confirm, and
    # consider testing for "tmp/tmp_redditModel.ckpt.index" instead (the
    # sampleA script in this repository checks the ".index" file).
    if os.path.exists("tmp/tmp_redditModel.ckpt"):
        saver.restore(sess, "tmp/tmp_redditModel.ckpt")
    test_cost, test_acc, test_duration = evaluate(features, testSupport, y_test, test_mask, placeholders)
    print("rank1 = {}".format(rank1), "rank0 = {}".format(rank0),
          "cost=", "{:.5f}".format(test_cost),
          "accuracy=", "{:.5f}".format(test_acc),
          "training time=", "{:.5f}".format(train_duration),
          "epoch = {}".format(epoch + 1),
          "test time=", "{:.5f}".format(test_duration))
flags.DEFINE_string('model', 'gcn_appr', 'Model string.')  # 'gcn', 'gcn_appr'
flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
flags.DEFINE_integer('epochs', 300, 'Number of epochs to train.')
flags.DEFINE_integer('hidden1', 64, 'Number of units in hidden layer 1.')
flags.DEFINE_float('dropout', 0.1, 'Dropout rate (1 - keep probability).')
flags.DEFINE_float('weight_decay', 1e-4, 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_integer('early_stopping', 30, 'Tolerance for early stopping (# of epochs).')
flags.DEFINE_integer('max_degree', 3, 'Maximum Chebyshev polynomial degree.')

# Importance-sampling ranks (number of nodes sampled per layer).
rank1 = 300
rank0 = 300


def iterate_minibatches_listinputs(inputs, batchsize, shuffle=False):
    """Yield aligned minibatches from a list of equally-sized array-likes.

    Each yielded item is a list with one slice per input. A trailing partial
    batch (fewer than `batchsize` rows) is dropped, matching the original
    behavior.
    """
    assert inputs is not None
    numSamples = inputs[0].shape[0]
    if shuffle:
        indices = np.arange(numSamples)
        np.random.shuffle(indices)
    for start_idx in range(0, numSamples - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        # FIX: renamed loop variable so it no longer shadows builtin `input`.
        yield [inp[excerpt] for inp in inputs]


def loadRedditFromG(dataset_dir, inputfile):
    """Load the pickled Reddit graph data plus the .npy feature matrix."""
    # FIX: open the pickle in binary mode and close it deterministically;
    # the original used a text-mode handle that was never closed.
    objects = []
    with open(dataset_dir + inputfile, 'rb') as f:
        for _ in range(pkl.load(f)):
            objects.append(pkl.load(f))
    adj, train_labels, val_labels, test_labels, train_index, val_index, test_index = tuple(objects)
    feats = np.load(dataset_dir + "/reddit-feats.npy")
    return (sp.csr_matrix(adj), sp.lil_matrix(feats), train_labels, val_labels,
            test_labels, train_index, val_index, test_index)


def loadRedditFromNPZ(dataset_dir):
    """Load the preprocessed Reddit adjacency (.npz) and data bundle."""
    adj = sp.load_npz(dataset_dir + "reddit_adj.npz")
    data = np.load(dataset_dir + "reddit.npz")
    return (adj, data['feats'], data['y_train'], data['y_val'], data['y_test'],
            data['train_index'], data['val_index'], data['test_index'])


def transferRedditDataFormat(dataset_dir, output_file):
    """Convert the GraphSAGE Reddit JSON dump into a single .npz bundle."""
    G = json_graph.node_link_graph(json.load(open(dataset_dir + "/reddit-G.json")))
    labels = json.load(open(dataset_dir + "/reddit-class_map.json"))
    train_ids = [n for n in G.nodes() if not G.node[n]['val'] and not G.node[n]['test']]
    test_ids = [n for n in G.nodes() if G.node[n]['test']]
    val_ids = [n for n in G.nodes() if G.node[n]['val']]
    train_labels = [labels[i] for i in train_ids]
    test_labels = [labels[i] for i in test_ids]
    val_labels = [labels[i] for i in val_ids]
    feats = np.load(dataset_dir + "/reddit-feats.npy")
    # Logistic gets thrown off by big counts, so log transform num comments and score
    feats[:, 0] = np.log(feats[:, 0] + 1.0)
    feats[:, 1] = np.log(feats[:, 1] - min(np.min(feats[:, 1]), -1))
    feat_id_map = json.load(open(dataset_dir + "reddit-id_map.json"))
    # FIX: dict.iteritems() is Python-2-only; a plain copy is equivalent.
    feat_id_map = dict(feat_id_map.items())
    # FIX: renamed comprehension variable so it no longer shadows builtin `id`.
    train_index = [feat_id_map[node_id] for node_id in train_ids]
    val_index = [feat_id_map[node_id] for node_id in val_ids]
    test_index = [feat_id_map[node_id] for node_id in test_ids]
    np.savez(output_file, feats=feats, y_train=train_labels, y_val=val_labels,
             y_test=test_labels, train_index=train_index, val_index=val_index,
             test_index=test_index)


def transferLabel2Onehot(labels, N):
    """Convert a vector of integer class labels into a (len(labels), N) one-hot matrix."""
    y = np.zeros((len(labels), N))
    for i in range(len(labels)):
        pos = labels[i]
        y[i, pos] = 1
    return y


def run_regression(train_embeds, train_labels, test_embeds, test_labels):
    """Baseline: SGD logistic regression on fixed embeddings vs. a dummy classifier."""
    np.random.seed(1)
    from sklearn.linear_model import SGDClassifier
    from sklearn.dummy import DummyClassifier
    from sklearn.metrics import accuracy_score
    dummy = DummyClassifier()
    dummy.fit(train_embeds, train_labels)
    log = SGDClassifier(loss="log", n_jobs=55)
    log.fit(train_embeds, train_labels)
    print("Test scores")
    print(accuracy_score(test_labels, log.predict(test_embeds)))
    print("Train scores")
    print(accuracy_score(train_labels, log.predict(train_embeds)))
    print("Random baseline")
    print(accuracy_score(test_labels, dummy.predict(test_embeds)))


def main(rank1):
    """Train the one-layer FastGCN (importance-sampled) model on Reddit.

    rank1: number of nodes importance-sampled for the single GCN layer, or
    None to use the full (unsampled) batch support.
    """
    adj, features, y_train, y_val, y_test, train_index, val_index, test_index = loadRedditFromNPZ("data/")
    # Symmetrize the adjacency (stored one-directional).
    adj = adj + adj.T
    y_train = transferLabel2Onehot(y_train, 50)
    y_val = transferLabel2Onehot(y_val, 50)
    y_test = transferLabel2Onehot(y_test, 50)
    features = sp.lil_matrix(features)

    # Inductive setting: training only ever sees the train-train subgraph.
    adj_train = adj[train_index, :][:, train_index]
    adj_val = adj[val_index, :][:, val_index]
    adj_test = adj[test_index, :][:, test_index]
    numNode_train = adj_train.shape[0]
    train_mask = np.ones((numNode_train,))
    val_mask = np.ones((adj_val.shape[0],))
    test_mask = np.ones((adj_test.shape[0],))

    # Some preprocessing
    features = nontuple_preprocess_features(features)
    train_features = features[train_index]

    if FLAGS.model == 'gcn_appr':
        normADJ_train = nontuple_preprocess_adj(adj_train)
        normADJ = nontuple_preprocess_adj(adj)
        num_supports = 2
        model_func = GCN_APPRO_Onelayer
    else:
        raise ValueError('Invalid argument for model: ' + str(FLAGS.model))

    # Define placeholders
    placeholders = {
        'support': [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
        'features': tf.sparse_placeholder(tf.float32),
        'labels': tf.placeholder(tf.float32, shape=(None, y_train.shape[1])),
        'labels_mask': tf.placeholder(tf.int32),
        'dropout': tf.placeholder_with_default(0., shape=()),
        'num_features_nonzero': tf.placeholder(tf.int32)  # helper variable for sparse dropout
    }

    # Create model and session
    model = model_func(placeholders, input_dim=features.shape[-1], logging=True)
    sess = tf.Session()

    def evaluate(features, support, labels, mask, placeholders):
        """Run loss/accuracy on a fixed (unsampled) support; returns timing too."""
        t_test = time.time()
        feed_dict_val = construct_feed_dict(features, support, labels, mask, placeholders)
        outs_val = sess.run([model.loss, model.accuracy], feed_dict=feed_dict_val)
        return outs_val[0], outs_val[1], (time.time() - t_test)

    # Init variables
    sess.run(tf.global_variables_initializer())
    cost_val = []

    # Validation/test use the full normalized adjacency rows (no sampling).
    valSupport = [sparse_to_tuple(normADJ[val_index, :])]
    testSupport = [sparse_to_tuple(normADJ[test_index, :])]

    t = time.time()
    # Train model
    for epoch in range(FLAGS.epochs):
        t1 = time.time()
        for batch in iterate_minibatches_listinputs([normADJ_train, y_train, train_mask],
                                                    batchsize=5120, shuffle=True):
            [normADJ_batch, y_train_batch, train_mask_batch] = batch
            if sum(train_mask_batch) < 1:
                continue
            # Column-norm importance distribution for this batch.
            p1 = column_prop(normADJ_batch)
            if rank1 is not None:
                q1 = np.random.choice(np.arange(numNode_train), rank1, p=p1)  # top layer
                # Rescale sampled columns by 1/(p*rank) so the sampled support
                # is an unbiased estimate of the full product.
                support1 = sparse_to_tuple(normADJ_batch[:, q1].dot(sp.diags(1.0 / (p1[q1] * rank1))))
                features_inputs = sparse_to_tuple(train_features[q1, :])  # selected nodes for approximation
            else:
                support1 = sparse_to_tuple(normADJ_batch)
                features_inputs = sparse_to_tuple(train_features)

            # Construct feed dictionary
            feed_dict = construct_feed_dict(features_inputs, [support1], y_train_batch,
                                            train_mask_batch, placeholders)
            feed_dict.update({placeholders['dropout']: FLAGS.dropout})

            # Training step
            outs = sess.run([model.opt_op, model.loss, model.accuracy], feed_dict=feed_dict)

        # Validation
        cost, acc, duration = evaluate(sparse_to_tuple(features), valSupport, y_val,
                                       val_mask, placeholders)
        cost_val.append(cost)

        # Print results
        print("Epoch:", '%04d' % (epoch + 1),
              "train_loss=", "{:.5f}".format(outs[1]),
              "train_acc=", "{:.5f}".format(outs[2]),
              "val_loss=", "{:.5f}".format(cost),
              "val_acc=", "{:.5f}".format(acc),
              "time=", "{:.5f}".format(time.time() - t1))

        if epoch > FLAGS.early_stopping and cost_val[-1] > np.mean(
                cost_val[-(FLAGS.early_stopping + 1):-1]):
            # print("Early stopping...")
            break

    train_duration = time.time() - t

    # Testing
    test_cost, test_acc, test_duration = evaluate(sparse_to_tuple(features), testSupport,
                                                  y_test, test_mask, placeholders)
    print("rank1 = {}".format(rank1), "rank0 = {}".format(rank0),
          "cost=", "{:.5f}".format(test_cost),
          "accuracy=", "{:.5f}".format(test_acc),
          "training time=", "{:.5f}".format(train_duration))


def transferG2ADJ():
    """Convert the raw Reddit JSON graph into a sparse CSR adjacency on disk."""
    G = json_graph.node_link_graph(json.load(open("reddit/reddit-G.json")))
    feat_id_map = json.load(open("reddit/reddit-id_map.json"))
    # FIX: dict.iteritems() is Python-2-only; a plain copy is equivalent.
    feat_id_map = dict(feat_id_map.items())
    numNode = len(feat_id_map)
    # Hoist edge list so the graph is traversed only once.
    edges = list(G.edges())
    newEdges0 = [feat_id_map[edge[0]] for edge in edges]
    newEdges1 = [feat_id_map[edge[1]] for edge in edges]
    # FIX: removed the dense np.zeros((numNode, numNode)) buffer that was
    # allocated (O(n^2) memory) and then immediately discarded.
    adj = sp.csr_matrix((np.ones((len(newEdges0),)), (newEdges0, newEdges1)),
                        shape=(numNode, numNode))
    sp.save_npz("reddit_adj.npz", adj)


def original():
    """Logistic-regression baseline on one-hop aggregated raw features."""
    adj, features, y_train, y_val, y_test, train_index, val_index, test_index = loadRedditFromNPZ("data/")
    adj = adj + adj.T
    # NOTE: the normalized adjacency computed here in the original was unused
    # and has been removed (nontuple_preprocess_adj over the full graph is
    # expensive).
    features = adj.dot(features)
    train_feats = features[train_index, :]
    test_feats = features[test_index, :]
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    scaler.fit(train_feats)
    train_feats = scaler.transform(train_feats)
    test_feats = scaler.transform(test_feats)
    run_regression(train_feats, y_train, test_feats, y_test)


if __name__ == "__main__":
    # transferRedditDataFormat("reddit/","data/reddit.npz")
    # original()
    main(50)


# ================================================
# FILE: transformRedditGraph2NPZ.py
# ================================================
#### Please first download original Reddit Graph Data:
#### http://snap.stanford.edu/graphsage/reddit.zip ####
import json
from networkx.readwrite import json_graph
import scipy.sparse as sp
import numpy as np
import pickle as pkl


def loadRedditFromG(dataset_dir, inputfile):
    """Load the pickled Reddit graph data plus the .npy feature matrix."""
    # FIX: open the pickle in binary mode and close it deterministically;
    # the original used a text-mode handle that was never closed.
    objects = []
    with open(dataset_dir + inputfile, 'rb') as f:
        for _ in range(pkl.load(f)):
            objects.append(pkl.load(f))
    adj, train_labels, val_labels, test_labels, train_index, val_index, test_index = tuple(objects)
    feats = np.load(dataset_dir + "/reddit-feats.npy")
    return (sp.csr_matrix(adj), sp.lil_matrix(feats), train_labels, val_labels,
            test_labels, train_index, val_index, test_index)


def loadRedditFromNPZ(dataset_dir):
    """Load the preprocessed Reddit adjacency (.npz) and data bundle."""
    adj = sp.load_npz(dataset_dir + "reddit_adj.npz")
    data = np.load(dataset_dir + "reddit.npz")
    return (adj, data['feats'], data['y_train'], data['y_val'], data['y_test'],
            data['train_index'], data['val_index'], data['test_index'])


def transferRedditData2AdjNPZ(dataset_dir):
    """Build the Reddit adjacency matrix from the JSON dump and save as reddit_adj.npz."""
    G = json_graph.node_link_graph(json.load(open(dataset_dir + "/reddit-G.json")))
    feat_id_map = json.load(open(dataset_dir + "/reddit-id_map.json"))
    # FIX: dict.iteritems() is Python-2-only; the original comprehension was
    # also an identity copy that shadowed the builtin `id`.
    feat_id_map = dict(feat_id_map.items())
    numNode = len(feat_id_map)
    # NOTE: leftover debug prints removed; lil_matrix supports efficient
    # incremental assignment, hence the format choice before conversion.
    adj = sp.lil_matrix((numNode, numNode))
    for edge in G.edges():
        adj[feat_id_map[edge[0]], feat_id_map[edge[1]]] = 1
    sp.save_npz("reddit_adj.npz", sp.coo_matrix(adj))


def transferRedditDataFormat(dataset_dir, output_file):
    """Convert the GraphSAGE Reddit JSON dump into a single .npz bundle."""
    G = json_graph.node_link_graph(json.load(open(dataset_dir + "/reddit-G.json")))
    labels = json.load(open(dataset_dir + "/reddit-class_map.json"))
    train_ids = [n for n in G.nodes() if not G.node[n]['val'] and not G.node[n]['test']]
    test_ids = [n for n in G.nodes() if G.node[n]['test']]
    val_ids = [n for n in G.nodes() if G.node[n]['val']]
    train_labels = [labels[i] for i in train_ids]
    test_labels = [labels[i] for i in test_ids]
    val_labels = [labels[i] for i in val_ids]
    feats = np.load(dataset_dir + "/reddit-feats.npy")
    ## Logistic gets thrown off by big counts, so log transform num comments and score
    feats[:, 0] = np.log(feats[:, 0] + 1.0)
    feats[:, 1] = np.log(feats[:, 1] - min(np.min(feats[:, 1]), -1))
    feat_id_map = json.load(open(dataset_dir + "reddit-id_map.json"))
    # FIX: dict.iteritems() is Python-2-only; a plain copy is equivalent.
    feat_id_map = dict(feat_id_map.items())
    # FIX: renamed comprehension variable so it no longer shadows builtin `id`.
    train_index = [feat_id_map[node_id] for node_id in train_ids]
    val_index = [feat_id_map[node_id] for node_id in val_ids]
    test_index = [feat_id_map[node_id] for node_id in test_ids]
    np.savez(output_file, feats=feats, y_train=train_labels, y_val=val_labels,
             y_test=test_labels, train_index=train_index, val_index=val_index,
             test_index=test_index)


if __name__ == "__main__":
    # transferRedditData2AdjNPZ("reddit")
    transferRedditDataFormat("reddit", "reddit.npz")


# ================================================
# FILE: utils.py
# ================================================
import numpy as np
import pickle as pkl
import networkx as nx
import scipy.sparse as sp
from scipy.sparse.linalg.eigen.arpack import eigsh
import sys
from scipy.sparse.linalg import norm as sparsenorm
from scipy.linalg import qr
# from sklearn.metrics import f1_score


def parse_index_file(filename):
    """Parse index file: one integer node index per line."""
    # FIX: close the file deterministically (the original leaked the handle).
    with open(filename) as f:
        return [int(line.strip()) for line in f]


def sample_mask(idx, l):
    """Create a boolean mask of length l that is True at positions idx."""
    mask = np.zeros(l)
    mask[idx] = 1
    # FIX: np.bool is a deprecated alias removed in NumPy >= 1.24; the builtin
    # bool produces an identical boolean array.
    return np.array(mask, dtype=bool)

# NOTE(review): a large block of commented-out dead code (calc_f1 and an older
# copy of load_data) was deleted here; see load_data_original below for the
# preserved original split.
def load_data(dataset_str):
    """Load a citation dataset (cora/citeseer/pubmed) in Planetoid format.

    Uses the FastGCN split: all labeled nodes except the last 500 for
    training, the last 500 of ally for validation (cf. load_data_original).

    Returns (adj, features, y_train, y_val, y_test, train_mask, val_mask,
    test_mask).
    """
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for i in range(len(names)):
        with open("data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))

    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range - min(test_idx_range), :] = ty
        ty = ty_extended

    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    idx_test = test_idx_range.tolist()
    # FastGCN split: train on all of ally except the last 500, validate on those 500.
    idx_train = range(len(ally) - 500)
    idx_val = range(len(ally) - 500, len(ally))

    train_mask = sample_mask(idx_train, labels.shape[0])
    val_mask = sample_mask(idx_val, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])

    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]

    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask


def load_data_original(dataset_str):
    """Load a citation dataset with the original Planetoid/GCN split.

    Same as load_data but trains on only len(y) labeled nodes and validates
    on the following 500.
    """
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for i in range(len(names)):
        with open("data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))

    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range - min(test_idx_range), :] = ty
        ty = ty_extended

    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y) + 500)

    train_mask = sample_mask(idx_train, labels.shape[0])
    val_mask = sample_mask(idx_val, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])

    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]

    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask


def sparse_to_tuple(sparse_mx):
    """Convert sparse matrix to tuple representation."""
    def to_tuple(mx):
        if not sp.isspmatrix_coo(mx):
            mx = mx.tocoo()
        coords = np.vstack((mx.row, mx.col)).transpose()
        values = mx.data
        shape = mx.shape
        return coords, values, shape

    if isinstance(sparse_mx, list):
        for i in range(len(sparse_mx)):
            sparse_mx[i] = to_tuple(sparse_mx[i])
    else:
        sparse_mx = to_tuple(sparse_mx)

    return sparse_mx


def nontuple_preprocess_features(features):
    """Row-normalize a feature matrix; rows summing to zero are left at zero."""
    rowsum = np.array(features.sum(1))
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    features = r_mat_inv.dot(features)
    return features


def preprocess_features(features):
    """Row-normalize feature matrix and convert to tuple representation."""
    # FIX: the original duplicated nontuple_preprocess_features line-for-line;
    # delegate so the two cannot drift apart.
    return sparse_to_tuple(nontuple_preprocess_features(features))


def normalize_adj(adj):
    """Symmetrically normalize adjacency matrix: D^-1/2 A D^-1/2."""
    adj = sp.coo_matrix(adj)
    rowsum = np.array(adj.sum(1))
    d_inv_sqrt = np.power(rowsum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo()


def nontuple_preprocess_adj(adj):
    """Symmetrically normalize (A + I) and return CSR (renormalization trick)."""
    adj_normalized = normalize_adj(sp.eye(adj.shape[0]) + adj)
    return adj_normalized.tocsr()


def column_prop(adj):
    """Importance-sampling distribution proportional to column L2 norms."""
    column_norm = sparsenorm(adj, axis=0)
    norm_sum = sum(column_norm)
    return column_norm / norm_sum


def mix_prop(adj, features, sparseinputs=False):
    """Sampling distribution mixing adjacency column norms with feature row norms."""
    adj_column_norm = sparsenorm(adj, axis=0)
    if sparseinputs:
        features_row_norm = sparsenorm(features, axis=1)
    else:
        features_row_norm = np.linalg.norm(features, axis=1)
    mix_norm = adj_column_norm * features_row_norm
    norm_sum = sum(mix_norm)
    return mix_norm / norm_sum


def preprocess_adj(adj):
    """Preprocessing of adjacency matrix for simple GCN model and conversion to
    tuple representation."""
    # NOTE(review): commented-out low-rank (lanczos/SVD) experiments removed.
    adj_normalized = normalize_adj(sp.eye(adj.shape[0]) + adj)
    return sparse_to_tuple(adj_normalized)


def dense_lanczos(A, K):
    """Rank-K approximation of dense A via Lanczos tridiagonalization."""
    # FIX: project-local import made lazy so utils stays importable when the
    # lanczos module is absent and unused.
    from lanczos import lanczos
    q = np.random.randn(A.shape[0], )
    Q, sigma = lanczos(A, K, q)
    A2 = np.dot(Q[:, :K], np.dot(sigma[:K, :K], Q[:, :K].T))
    return sp.csr_matrix(A2)


def sparse_lanczos(A, k):
    """Rank-k approximation of sparse A via Lanczos with full reorthogonalization."""
    q = sp.random(A.shape[0], 1)
    n = A.shape[0]
    Q = sp.lil_matrix(np.zeros((n, k + 1)))
    A = sp.lil_matrix(A)
    Q[:, 0] = q / sparsenorm(q)

    alpha = 0
    beta = 0
    for i in range(k):
        if i == 0:
            q = A * Q[:, i]
        else:
            q = A * Q[:, i] - beta * Q[:, i - 1]
        alpha = q.T * Q[:, i]
        q = q - Q[:, i] * alpha
        q = q - Q[:, :i] * Q[:, :i].T * q  # full reorthogonalization
        beta = sparsenorm(q)
        Q[:, i + 1] = q / beta
        # FIX: leftover per-iteration debug print removed.
    Q = Q[:, :k]
    Sigma = Q.T * A * Q
    A2 = Q[:, :k] * Sigma[:k, :k] * Q[:, :k].T
    return A2


def dense_RandomSVD(A, K):
    """Rank-K approximation of A via randomized range-finding (Q Q^T A)."""
    G = np.random.randn(A.shape[0], K)
    B = np.dot(A, G)
    Q, R = qr(B, mode='economic')
    M = np.dot(Q, np.dot(Q.T, A))
    return sp.csr_matrix(M)


def construct_feed_dict(features, support, labels, labels_mask, placeholders):
    """Construct feed dictionary."""
    feed_dict = dict()
    feed_dict.update({placeholders['labels']: labels})
    feed_dict.update({placeholders['labels_mask']: labels_mask})
    feed_dict.update({placeholders['features']: features})
    feed_dict.update({placeholders['support'][i]: support[i] for i in range(len(support))})
    # features is a (coords, values, shape) tuple; values count feeds sparse dropout.
    feed_dict.update({placeholders['num_features_nonzero']: features[1].shape})
    return feed_dict


def chebyshev_polynomials(adj, k):
    """Calculate Chebyshev polynomials up to order k. Return a list of sparse
    matrices (tuple representation)."""
    print("Calculating Chebyshev polynomials up to order {}...".format(k))

    adj_normalized = normalize_adj(adj)
    laplacian = sp.eye(adj.shape[0]) - adj_normalized
    largest_eigval, _ = eigsh(laplacian, 1, which='LM')
    # Rescale the Laplacian spectrum into [-1, 1] for the Chebyshev recurrence.
    scaled_laplacian = (2. / largest_eigval[0]) * laplacian - sp.eye(adj.shape[0])

    t_k = list()
    t_k.append(sp.eye(adj.shape[0]))
    t_k.append(scaled_laplacian)

    def chebyshev_recurrence(t_k_minus_one, t_k_minus_two, scaled_lap):
        # T_k(x) = 2x T_{k-1}(x) - T_{k-2}(x)
        s_lap = sp.csr_matrix(scaled_lap, copy=True)
        return 2 * s_lap.dot(t_k_minus_one) - t_k_minus_two

    for i in range(2, k + 1):
        t_k.append(chebyshev_recurrence(t_k[-1], t_k[-2], scaled_laplacian))

    return sparse_to_tuple(t_k)