Repository: tech-srl/code2seq
Branch: master
Commit: 8ca14173c323
Files: 71
Total size: 312.7 KB
Directory structure:
gitextract_t37_dsto/
├── .gitignore
├── CITATION.cff
├── CSharpExtractor/
│ ├── .gitattributes
│ ├── .gitignore
│ ├── CSharpExtractor/
│ │ ├── .nuget/
│ │ │ └── packages.config
│ │ ├── CSharpExtractor.sln
│ │ └── Extractor/
│ │ ├── Extractor.cs
│ │ ├── Extractor.csproj
│ │ ├── PathFinder.cs
│ │ ├── Program.cs
│ │ ├── Properties/
│ │ │ └── launchSettings.json
│ │ ├── Temp.cs
│ │ ├── Tree/
│ │ │ └── Tree.cs
│ │ ├── Utilities.cs
│ │ └── Variable.cs
│ └── extract.py
├── Input.java
├── JavaExtractor/
│ ├── JPredict/
│ │ ├── .classpath
│ │ ├── .gitignore
│ │ ├── src/
│ │ │ └── main/
│ │ │ └── java/
│ │ │ ├── JavaExtractor/
│ │ │ │ ├── App.java
│ │ │ │ ├── Common/
│ │ │ │ │ ├── CommandLineValues.java
│ │ │ │ │ ├── Common.java
│ │ │ │ │ └── MethodContent.java
│ │ │ │ ├── ExtractFeaturesTask.java
│ │ │ │ ├── FeatureExtractor.java
│ │ │ │ ├── FeaturesEntities/
│ │ │ │ │ ├── ProgramFeatures.java
│ │ │ │ │ ├── ProgramRelation.java
│ │ │ │ │ └── Property.java
│ │ │ │ └── Visitors/
│ │ │ │ ├── FunctionVisitor.java
│ │ │ │ └── LeavesCollectorVisitor.java
│ │ │ └── Test.java
│ │ └── target/
│ │ └── JavaExtractor-0.0.1-SNAPSHOT.jar
│ └── extract.py
├── LICENSE
├── Python150kExtractor/
│ ├── README.md
│ ├── extract.py
│ └── preprocess.sh
├── README.md
├── __init__.py
├── baseline_tokenization/
│ ├── input_example.txt
│ ├── javalang/
│ │ ├── __init__.py
│ │ ├── ast.py
│ │ ├── javadoc.py
│ │ ├── parse.py
│ │ ├── parser.py
│ │ ├── test/
│ │ │ ├── __init__.py
│ │ │ ├── source/
│ │ │ │ └── package-info/
│ │ │ │ ├── AnnotationJavadoc.java
│ │ │ │ ├── AnnotationOnly.java
│ │ │ │ ├── JavadocAnnotation.java
│ │ │ │ ├── JavadocOnly.java
│ │ │ │ └── NoAnnotationNoJavadoc.java
│ │ │ ├── test_java_8_syntax.py
│ │ │ ├── test_javadoc.py
│ │ │ ├── test_package_declaration.py
│ │ │ └── test_util.py
│ │ ├── tokenizer.py
│ │ ├── tree.py
│ │ └── util.py
│ └── subtokenize_nmt_baseline.py
├── code2seq.py
├── common.py
├── config.py
├── extractor.py
├── interactive_predict.py
├── model.py
├── preprocess.py
├── preprocess.sh
├── preprocess_csharp.sh
├── reader.py
├── train.sh
└── train_python150k.sh
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
*.class
*.lst
.idea/*
*.iml
*.xml
*.pyc
================================================
FILE: CITATION.cff
================================================
@inproceedings{
alon2018codeseq,
title={code2seq: Generating Sequences from Structured Representations of Code},
author={Uri Alon and Shaked Brody and Omer Levy and Eran Yahav},
booktitle={International Conference on Learning Representations},
year={2019},
url={https://openreview.net/forum?id=H1gKYo09tX},
}
================================================
FILE: CSharpExtractor/.gitattributes
================================================
###############################################################################
# Set default behavior to automatically normalize line endings.
###############################################################################
* text=auto
###############################################################################
# Set default behavior for command prompt diff.
#
# This is needed for earlier builds of msysgit that do not have it on by
# default for csharp files.
# Note: This is only used by command line
###############################################################################
#*.cs diff=csharp
###############################################################################
# Set the merge driver for project and solution files
#
# Merging from the command prompt will add diff markers to the files if there
# are conflicts (Merging from VS is not affected by the settings below, in VS
# the diff markers are never inserted). Diff markers may cause the following
# file extensions to fail to load in VS. An alternative would be to treat
# these files as binary and thus will always conflict and require user
# intervention with every merge. To do so, just uncomment the entries below
###############################################################################
#*.sln merge=binary
#*.csproj merge=binary
#*.vbproj merge=binary
#*.vcxproj merge=binary
#*.vcproj merge=binary
#*.dbproj merge=binary
#*.fsproj merge=binary
#*.lsproj merge=binary
#*.wixproj merge=binary
#*.modelproj merge=binary
#*.sqlproj merge=binary
#*.wwaproj merge=binary
###############################################################################
# behavior for image files
#
# image files are treated as binary by default.
###############################################################################
#*.jpg binary
#*.png binary
#*.gif binary
###############################################################################
# diff behavior for common document formats
#
# Convert binary document formats to text before diffing them. This feature
# is only available from the command line. Turn it on by uncommenting the
# entries below.
###############################################################################
#*.doc diff=astextplain
#*.DOC diff=astextplain
#*.docx diff=astextplain
#*.DOCX diff=astextplain
#*.dot diff=astextplain
#*.DOT diff=astextplain
#*.pdf diff=astextplain
#*.PDF diff=astextplain
#*.rtf diff=astextplain
#*.RTF diff=astextplain
================================================
FILE: CSharpExtractor/.gitignore
================================================
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
# User-specific files
*.suo
*.user
*.userosscache
*.sln.docstates
# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs
# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
# Visual Studio 2015 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
# NUNIT
*.VisualState.xml
TestResult.xml
# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c
# DNX
project.lock.json
artifacts/
*_i.c
*_p.c
*_i.h
*.ilk
*.meta
*.obj
*.pch
*.pdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc
# Chutzpah Test files
_Chutzpah*
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb
# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap
# TFS 2012 Local Workspace
$tf/
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user
# JustCode is a .NET coding add-in
.JustCode
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*
# MightyMoose
*.mm.*
AutoTest.Net/
# Web workbench (sass)
.sass-cache/
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# TODO: Comment the next line if you want to checkin your web deploy settings
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj
# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/
# NuGet Packages
*.nupkg
# The packages folder can be ignored because of Package Restore
**/packages/*
# except build/, which is used as an MSBuild target.
!**/packages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/packages/repositories.config
# NuGet v3's project.json files produce more ignorable files
*.nuget.props
*.nuget.targets
# Microsoft Azure Build Output
csx/
*.build.csdef
# Microsoft Azure Emulator
ecf/
rcf/
# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!*.[Cc]ache/
# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.pfx
*.publishsettings
node_modules/
orleans.codegen.cs
# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
# SQL Server files
*.mdf
*.ldf
# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
# Microsoft Fakes
FakesAssemblies/
# GhostDoc plugin setting file
*.GhostDoc.xml
# Node.js Tools for Visual Studio
.ntvs_analysis.dat
# Visual Studio 6 build log
*.plg
# Visual Studio 6 workspace options file
*.opt
# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions
# Paket dependency manager
.paket/paket.exe
paket-files/
# FAKE - F# Make
.fake/
# JetBrains Rider
.idea/
*.sln.iml
# no data
data/*
backupdata/*
================================================
FILE: CSharpExtractor/CSharpExtractor/.nuget/packages.config
================================================
================================================
FILE: CSharpExtractor/CSharpExtractor/CSharpExtractor.sln
================================================
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.28307.136
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Extractor", "Extractor\Extractor.csproj", "{481EDE3F-0ED1-4CB9-814A-63A821022552}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Debug|x64 = Debug|x64
Debug|x86 = Debug|x86
Release|Any CPU = Release|Any CPU
Release|x64 = Release|x64
Release|x86 = Release|x86
Release20|Any CPU = Release20|Any CPU
Release20|x64 = Release20|x64
Release20|x86 = Release20|x86
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Debug|Any CPU.Build.0 = Debug|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Debug|x64.ActiveCfg = Debug|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Debug|x64.Build.0 = Debug|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Debug|x86.ActiveCfg = Debug|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Debug|x86.Build.0 = Debug|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Release|Any CPU.ActiveCfg = Release|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Release|Any CPU.Build.0 = Release|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Release|x64.ActiveCfg = Release|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Release|x64.Build.0 = Release|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Release|x86.ActiveCfg = Release|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Release|x86.Build.0 = Release|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Release20|Any CPU.ActiveCfg = Release|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Release20|Any CPU.Build.0 = Release|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Release20|x64.ActiveCfg = Release|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Release20|x64.Build.0 = Release|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Release20|x86.ActiveCfg = Release|Any CPU
{481EDE3F-0ED1-4CB9-814A-63A821022552}.Release20|x86.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {13A0DA89-D5D9-4E75-850E-70B9FBE88FF8}
EndGlobalSection
EndGlobal
================================================
FILE: CSharpExtractor/CSharpExtractor/Extractor/Extractor.cs
================================================
using Extractor.Semantics;
using Microsoft.CodeAnalysis;
using Microsoft.CodeAnalysis.CSharp;
using Microsoft.CodeAnalysis.CSharp.Syntax;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Diagnostics;
namespace Extractor
{
public class Extractor
{
// Separator placed between the subtokens of a name and between the node kinds of a path.
public const string InternalDelimiter = "|";
// Both tree directions use the same delimiter, so a rendered path does not encode direction.
public const string UpTreeChar = InternalDelimiter;
public const string DownTreeChar = InternalDelimiter;
// Placeholder name that SplitNameUnlessEmpty returns unchanged instead of splitting.
public const string MethodNameConst = "METHOD_NAME";
// A path node whose parent has one of these kinds gets its (truncated) child index appended to its kind name.
public static SyntaxKind[] ParentTypeToAddChildId = new SyntaxKind[] { SyntaxKind.SimpleAssignmentExpression,
SyntaxKind.ElementAccessExpression, SyntaxKind.SimpleMemberAccessExpression, SyntaxKind.InvocationExpression, SyntaxKind.BracketedArgumentList, SyntaxKind.ArgumentList};
// Variables of the method currently being processed; reassigned by Extract for every method.
private ICollection variables;
// Maximum path length handed to PathFinder.
public int LengthLimit { get; set; }
// Maximum path width handed to PathFinder.
public int WidthLimit { get; set; }
// Source text that Extract parses.
public string Code { get; set; }
// When true, MaybeHash emits a hash of the path string instead of the string itself.
public bool ShouldHash { get; set; }
// Upper bound on the number of variable pairs sampled in GetInternalPaths.
public int MaxContexts { get; set; }
/// <summary>
/// Creates an extractor for one source text, taking the path limits, the hashing
/// flag and the sampling cap from the parsed command-line options.
/// </summary>
/// <param name="code">Source text to extract from.</param>
/// <param name="opts">Parsed options; hashing is enabled when <c>NoHash</c> is false.</param>
public Extractor(string code, Options opts)
{
    this.MaxContexts = opts.MaxContexts;
    this.ShouldHash = !opts.NoHash;
    this.WidthLimit = opts.MaxWidth;
    this.LengthLimit = opts.MaxLength;
    this.Code = code;
}
// Scratch buffer shared by PathNodesToString; it is cleared at the start of every call.
StringBuilder builder = new StringBuilder();

/// <summary>
/// Renders the node kinds along a path: every left-side node followed by the
/// up delimiter, then the common ancestor, then every right-side node preceded
/// by the down delimiter.
/// </summary>
/// <param name="path">Path whose LeftSide, Ancesstor and RightSide are rendered.</param>
/// <returns>The delimiter-separated kind sequence.</returns>
private string PathNodesToString(PathFinder.Path path)
{
    builder.Clear();
    foreach (SyntaxNode n in path.LeftSide)
    {
        AppendKindWithChildId(n);
        builder.Append(UpTreeChar);
    }
    builder.Append(path.Ancesstor.Kind());
    foreach (SyntaxNode n in path.RightSide)
    {
        builder.Append(DownTreeChar);
        AppendKindWithChildId(n);
    }
    return builder.ToString();
}

// Appends the node's kind and, when its parent's kind is listed in
// ParentTypeToAddChildId, the node's truncated child index.
private void AppendKindWithChildId(SyntaxNode n)
{
    builder.Append(n.Kind());
    if (ParentTypeToAddChildId.Contains(n.Parent.Kind()))
    {
        builder.Append(GetTruncatedChildId(n));
    }
}
/// <summary>
/// Returns the position of <paramref name="n"/> among its parent's child nodes,
/// capped at 3 so that all later children share the same id.
/// </summary>
private int GetTruncatedChildId(SyntaxNode n)
{
    const int MaxChildId = 3;
    int position = n.Parent.ChildNodes().ToList().IndexOf(n);
    return position > MaxChildId ? MaxChildId : position;
}
/// <summary>
/// Renders a complete path context: the left token text, the node-kind path
/// and the right token text, joined by the tree delimiters.
/// </summary>
/// <param name="path">Path to render.</param>
/// <returns>Left text, up delimiter, kind path, down delimiter, right text.</returns>
private string PathToString(PathFinder.Path path)
{
    // PathNodesToString clears and reuses the shared builder field, so the
    // result is assembled by concatenation here rather than in that buffer.
    return path.Left.Text
        + UpTreeChar
        + this.PathNodesToString(path)
        + DownTreeChar
        + path.Right.Text;
}
/// <summary>
/// Lazily yields the paths between leaves of sampled variable pairs.
/// The candidate pairs are every unordered pair of distinct variables plus
/// each variable paired with itself; at most MaxContexts pairs are sampled.
/// </summary>
/// <param name="tree">Tree of the method whose variables are in <c>variables</c>.</param>
/// <returns>Every path that PathFinder accepts under the length and width limits.</returns>
internal IEnumerable GetInternalPaths(Tree tree)
{
    var finder = new PathFinder(tree, LengthLimit, WidthLimit);
    var candidatePairs = Utilities.WeakConcat(
        Utilities.Choose2(variables),
        variables.Select((arg) => Tuple.Create(arg, arg)));
    var allPairs = Utilities.ReservoirSample(candidatePairs, MaxContexts);
    // iterate over variable-variable pairs
    foreach (var varPair in allPairs)
    {
        foreach (var rhs in varPair.Item2.Leaves)
        {
            foreach (var lhs in varPair.Item1.Leaves)
            {
                // A token never forms a path with itself.
                if (lhs == rhs)
                {
                    continue;
                }
                PathFinder.Path path = finder.FindPath(lhs, rhs, limited: true);
                if (path == null)
                {
                    continue;
                }
                yield return path;
            }
        }
    }
}
/// <summary>
/// Converts a name into its delimiter-joined subtokens. When splitting yields
/// nothing, the normalized original is used; when that is empty too, the
/// result is "SPACE". The method-name placeholder is returned unchanged.
/// </summary>
/// <param name="original">Raw name or comment text.</param>
/// <returns>The normalized, never-empty name.</returns>
private string SplitNameUnlessEmpty(string original)
{
    var subtokens = Utilities.SplitToSubtokens(original).Where(s => s.Length > 0);
    if (original == Extractor.MethodNameConst)
    {
        return original;
    }
    String name = String.Join(InternalDelimiter, subtokens);
    if (name.Length == 0)
    {
        name = Utilities.NormalizeName(original);
    }
    // This check also covers the empty string, so the former
    // IsNullOrEmpty -> "BLANK" fallback that followed it could never run.
    if (String.IsNullOrWhiteSpace(name))
    {
        name = "SPACE";
    }
    return name;
}
// Characters trimmed from both ends of a comment before it is split into subtokens.
static readonly char[] removeFromComments = new char[] {' ', '/', '*', '{', '}'};
/// <summary>
/// Parses Code and builds one output line for each entry of <c>methods</c>:
/// the '|'-joined subtokens of the method name, a space, and the
/// space-separated contexts (path contexts followed by comment contexts).
/// </summary>
/// <returns>The list of output lines.</returns>
public List Extract()
{
var tree = new Tree(CSharpSyntaxTree.ParseText(Code).GetRoot());
IEnumerable methods = tree.GetRoot().DescendantNodesAndSelf().OfType().ToList();
List results = new List();
foreach(var method in methods) {
String methodName = method.Identifier.ValueText;
// Each method gets its own Tree so paths are found within the method only.
Tree methodTree = new Tree(method);
var subtokensMethodName = Utilities.SplitToSubtokens(methodName);
var tokenToVar = new Dictionary();
this.variables = Variable.CreateFromMethod(methodTree).ToArray();
// Map every leaf token back to the variable that owns it.
foreach (var variable in variables)
{
foreach (SyntaxToken token in variable.Leaves)
{
tokenToVar[token] = variable;
}
}
List contexts = new List();
// One "left,path,right" context per path; the path part is hashed when ShouldHash is set.
foreach (PathFinder.Path path in GetInternalPaths(methodTree))
{
String pathString = SplitNameUnlessEmpty(tokenToVar[path.Left].Name)
+ "," + MaybeHash(this.PathNodesToString(path))
+ "," + SplitNameUnlessEmpty(tokenToVar[path.Right].Name);
Debug.WriteLine(path.Left.FullSpan+" "+tokenToVar[path.Left].Name+ "," +this.PathNodesToString(path)+ "," + tokenToVar[path.Right].Name+" "+path.Right.FullSpan);
contexts.Add(pathString);
}
// NOTE(review): the trivia is read from the root of the whole parsed text, not from `method`,
// so every method receives a context for every comment in the file — confirm this is intended.
var commentNodes = tree.GetRoot().DescendantTrivia().Where(
node => node.IsKind(SyntaxKind.MultiLineCommentTrivia) || node.IsKind(SyntaxKind.SingleLineCommentTrivia) || node.IsKind(SyntaxKind.MultiLineDocumentationCommentTrivia));
foreach (SyntaxTrivia trivia in commentNodes)
{
string commentText = trivia.ToString().Trim(removeFromComments);
string normalizedTrivia = SplitNameUnlessEmpty(commentText);
var parts = normalizedTrivia.Split('|');
// Emit the comment subtokens in batches of five, each as a "batch,COMMENT,batch" context.
for (int i = 0; i < Math.Ceiling((double)parts.Length / (double)5); i++)
{
var batch = String.Join("|", parts.Skip(i * 5).Take(5));
contexts.Add(batch + "," + "COMMENT" + "," + batch);
}
}
results.Add(String.Join("|", subtokensMethodName) + " " + String.Join(" ", contexts));
}
return results;
}
/// <summary>
/// Returns <paramref name="v"/> unchanged when hashing is disabled, otherwise
/// the decimal form of a deterministic 32-bit hash of it.
/// </summary>
/// <param name="v">Path string to emit or hash.</param>
/// <returns>The original string, or its hash rendered as text.</returns>
private string MaybeHash(string v)
{
    if (!this.ShouldHash)
    {
        return v;
    }
    // string.GetHashCode() is randomized per process on .NET Core, which would
    // give the same path a different id on every extractor run.
    return StableHash(v).ToString();
}

// Deterministic polynomial string hash (h = 31 * h + c over the UTF-16 code
// units, wrapping on overflow); the result depends only on the characters.
private static int StableHash(string s)
{
    unchecked
    {
        int hash = 0;
        foreach (char c in s)
        {
            hash = 31 * hash + c;
        }
        return hash;
    }
}
}
}
================================================
FILE: CSharpExtractor/CSharpExtractor/Extractor/Extractor.csproj
================================================
Exe
netcoreapp2.2
Extractor.Program
================================================
FILE: CSharpExtractor/CSharpExtractor/Extractor/PathFinder.cs
================================================
using Microsoft.CodeAnalysis;
using Microsoft.CodeAnalysis.CSharp.Syntax;
using System;
using System.Collections.Generic;
using System.Linq;
namespace Extractor
{
internal class PathFinder
{
/// <summary>
/// Describes a path between two leaf tokens that runs through a common ancestor node.
/// </summary>
internal class Path
{
// Token on the left end of the path.
public SyntaxToken Left { get; }
// Nodes on the left side of the ancestor, copied into a list by the constructor.
public List LeftSide { get; }
// Common ancestor joining the two sides.
public SyntaxNode Ancesstor { get; }
// Nodes on the right side of the ancestor, copied into a list by the constructor.
public List RightSide { get; }
// Token on the right end of the path.
public SyntaxToken Right { get; }
// Stores the end tokens and the ancestor, and materializes both side sequences.
public Path(SyntaxToken left, IEnumerable leftSide, SyntaxNode ancesstor,
IEnumerable rightSide, SyntaxToken right)
{
this.Left = left;
this.LeftSide = leftSide.ToList();
this.Ancesstor = ancesstor;
this.RightSide = rightSide.ToList();
this.Right = right;
}
}
// Longest path (in nodes, counting both leaves) that FindPath will return.
public int Length { get; }
// Maximum allowed distance between the two child positions under the common ancestor.
public int Width { get; }
Tree tree;

/// <summary>
/// Creates a finder with the given limits.
/// </summary>
/// <exception cref="ArgumentException">Either limit is smaller than 1.</exception>
public PathFinder(Tree tree, int length = 7, int width = 4)
{
    bool limitsArePositive = length >= 1 && width >= 1;
    if (!limitsArePositive)
    {
        throw new ArgumentException("Width and Length params must be positive.");
    }
    this.tree = tree;
    this.Width = width;
    this.Length = length;
}
/// <summary>
/// Counts the parents above <paramref name="n"/>; a node without a parent has depth 0.
/// </summary>
private int GetDepth(SyntaxNode n)
{
    int levels = 0;
    for (SyntaxNode up = n.Parent; up != null; up = up.Parent)
    {
        levels++;
    }
    return levels;
}
/// <summary>
/// Finds the lowest node that is an ancestor of (or equal to) both arguments
/// by repeatedly lifting whichever node is deeper; ties lift the left one.
/// </summary>
/// <param name="l">Left node.</param>
/// <param name="r">Right node.</param>
/// <returns>The first common ancestor of the two nodes.</returns>
public SyntaxNode FirstAncestor(SyntaxNode l, SyntaxNode r)
{
    // Depths are computed once and decremented while climbing, instead of
    // being re-measured from scratch at every step.
    int depthL = GetDepth(l);
    int depthR = GetDepth(r);
    while (!l.Equals(r))
    {
        if (depthL >= depthR)
        {
            l = l.Parent;
            depthL--;
        }
        else
        {
            r = r.Parent;
            depthR--;
        }
    }
    return l;
}
/// <summary>
/// Lazily yields <paramref name="start"/> and each successive parent, stopping
/// before <paramref name="parent"/> (which is not yielded).
/// </summary>
private IEnumerable CollectPathToParent(SyntaxNode start, SyntaxNode parent)
{
    for (SyntaxNode current = start; !current.Equals(parent); current = current.Parent)
    {
        yield return current;
    }
}
/// <summary>
/// Builds the path between two tokens through the first common ancestor of
/// their parent nodes.
/// </summary>
/// <param name="l">Left token.</param>
/// <param name="r">Right token.</param>
/// <param name="limited">When true, the width limit is enforced as well as the length limit.</param>
/// <returns>The path, or null when it exceeds Length or (if limited) Width.</returns>
internal Path FindPath(SyntaxToken l, SyntaxToken r, bool limited = true)
{
    SyntaxNode p = FirstAncestor(l.Parent, r.Parent);
    // + 2 for the distance of the leaves themselves
    if (GetDepth(r.Parent) + GetDepth(l.Parent) - 2 * GetDepth(p) + 2 > Length)
    {
        return null;
    }
    // Materialize both sides once; the lazy sequences would otherwise be
    // re-walked by every Count/First/Last call and again by the Path constructor.
    var leftSide = CollectPathToParent(l.Parent, p).ToList();
    var rightSide = CollectPathToParent(r.Parent, p).Reverse().ToList();
    if (limited && leftSide.Count != 0 && rightSide.Count != 0)
    {
        // The last left node and the first right node are the two children of
        // the ancestor that the path passes through.
        var siblings = p.ChildNodes().ToList();
        int indexOfLeft = siblings.IndexOf(leftSide[leftSide.Count - 1]);
        int indexOfRight = siblings.IndexOf(rightSide[0]);
        if (Math.Abs(indexOfLeft - indexOfRight) >= Width)
        {
            return null;
        }
    }
    return new Path(l, leftSide, p, rightSide, r);
}
}
}
================================================
FILE: CSharpExtractor/CSharpExtractor/Extractor/Program.cs
================================================
using CommandLine;
using CommandLine.Text;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
namespace Extractor
{
class Program
{
/// <summary>
/// Reads one source file and returns the lines its Extractor produces.
/// </summary>
static List ExtractSingleFile(string filename, Options opts)
{
    return new Extractor(File.ReadAllText(filename), opts).Extract();
}
/// <summary>
/// Parses the command line, extracts every "*.cs" file under the given
/// directory (or the single given file) in parallel, and appends the
/// resulting lines to the output file.
/// </summary>
static void Main(string[] args)
{
    Options options = null;
    Parser.Default.ParseArguments(args)
        .WithParsed(opt => options = opt)
        .WithNotParsed(errors => Console.WriteLine(errors));
    // A `return` inside the WithNotParsed callback only leaves the callback,
    // so the failure has to be detected here to actually stop the program.
    if (options == null)
    {
        return;
    }
    string path = options.Path;
    string[] files;
    if (Directory.Exists(path))
    {
        files = Directory.GetFiles(path, "*.cs", SearchOption.AllDirectories);
    }
    else
    {
        // Not a directory: treat the argument as a single source file.
        files = new string[] { path };
    }
    var results = files.AsParallel()
        .WithDegreeOfParallelism(options.Threads)
        .SelectMany(filename => ExtractSingleFile(filename, options));
    // The output file is opened in append mode, so repeated runs accumulate lines.
    using (StreamWriter sw = new StreamWriter(options.OFileName, append: true))
    {
        foreach (var res in results)
        {
            sw.WriteLine(res);
        }
    }
}
}
}
================================================
FILE: CSharpExtractor/CSharpExtractor/Extractor/Properties/launchSettings.json
================================================
{
"profiles": {
"Extractor": {
"commandName": "Project",
"commandLineArgs": "--path C:\\Users\\urial\\Source\\Repos\\CSharpExtractor\\CSharpExtractor\\Extractor\\bin\\ --no_hash"
}
}
}
================================================
FILE: CSharpExtractor/CSharpExtractor/Extractor/Temp.cs
================================================
namespace Extractor
{
// Tiny sample: one method nested two classes deep, containing a single member assignment.
// NOTE(review): a, b and c are not declared in this file — presumably this is sample
// input for the extractor rather than production code; confirm.
class Temp
{
class NestedClass
{
void fooBar()
{
a.b = c;
}
}
}
}
================================================
FILE: CSharpExtractor/CSharpExtractor/Extractor/Tree/Tree.cs
================================================
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using Microsoft.CodeAnalysis;
using Microsoft.CodeAnalysis.CSharp;
using Microsoft.CodeAnalysis.CSharp.Syntax;
namespace Extractor
{
public class Tree
{
// Placeholder identifiers; Leaf.IsLeafToken rejects tokens whose text is DummyMethodName or DummyType.
public const string DummyClass = "IgnoreDummyClass";
public const string DummyMethodName = "IgnoreDummyMethod";
public const string DummyType = "IgnoreDummyType";
// Literal token kinds that Leaf.IsLeafToken accepts as leaves.
internal static readonly SyntaxKind[] literals = { SyntaxKind.NumericLiteralToken, SyntaxKind.StringLiteralToken, SyntaxKind.CharacterLiteralToken };
// Identifier token kinds that Leaf.IsLeafToken accepts as leaves.
internal static readonly HashSet identifiers = new HashSet(new SyntaxKind[] { SyntaxKind.IdentifierToken }); //, SyntaxKind.VoidKeyword, SyntaxKind.StringKeyword });
internal static readonly HashSet keywords = new HashSet(new SyntaxKind[] { SyntaxKind.RefKeyword, SyntaxKind.OutKeyword, SyntaxKind.ConstKeyword });
// Parent kinds under which an identifier token is a declaration; Leaf uses this to compute IsConst.
internal static readonly HashSet declarations = new HashSet(new SyntaxKind[] { SyntaxKind.VariableDeclarator, SyntaxKind.Parameter, SyntaxKind.CatchDeclaration, SyntaxKind.ForEachStatement });
internal static readonly HashSet memberAccesses = new HashSet(new SyntaxKind[] { SyntaxKind.SimpleMemberAccessExpression, SyntaxKind.PointerMemberAccessExpression });
// Node kinds that IsScopeEnder reports as ending a scope.
internal static readonly HashSet scopeEnders = new HashSet(
new SyntaxKind[]{ SyntaxKind.Block, SyntaxKind.ForStatement, SyntaxKind.MethodDeclaration,
SyntaxKind.ForEachStatement, SyntaxKind.CatchClause, SyntaxKind.SwitchSection, SyntaxKind.UsingStatement });
internal static readonly HashSet lambdaScopeStarters = new HashSet(
new SyntaxKind[]{ SyntaxKind.AnonymousMethodExpression,
SyntaxKind.SimpleLambdaExpression, SyntaxKind.ParenthesizedLambdaExpression });
/// <summary>
/// Tells whether the node's kind is one of the scope-ending kinds.
/// </summary>
public static bool IsScopeEnder(SyntaxNode node)
{
    SyntaxKind kind = node.Kind();
    return scopeEnders.Contains(kind);
}
/// <summary>
/// Syntax walker that records a Node summary for every visited syntax node and a
/// Leaf for every leaf token, writing both into the dictionaries it was given.
/// </summary>
class TreeBuilderWalker : CSharpSyntaxWalker
{
// Output: summary per syntax node.
Dictionary nodes;
// Nodes whose Visit call is currently in progress.
HashSet visitedNodes;
// Scratch lists, re-created for each node after its children have been processed.
List Desc;
List Tokens;
// Output: Leaf per accepted token.
Dictionary tokens;
internal TreeBuilderWalker(Dictionary nodes, Dictionary tokens)
{
visitedNodes = new HashSet();
this.nodes = nodes;
this.tokens = tokens;
}
// Builds the summary for `node` after the base walker has processed it.
// The child loop below relies on children already being present in `nodes`,
// i.e. on base.Visit having re-entered this override for them first.
public override
void Visit(SyntaxNode node)
{
visitedNodes.Add(node);
base.Visit(node);
// Remove the node itself so that visitedNodes holds only its ancestors below.
visitedNodes.Remove(node);
Desc = new List();
Tokens = new List();
foreach (var c in node.ChildNodes())
{
// Children without a recorded summary are skipped.
if (!nodes.ContainsKey(c))
{
continue;
}
Desc.AddRange(nodes[c].Descendents);
Desc.Add(c);
Tokens.AddRange(nodes[c].Leaves);
}
// Direct child tokens that qualify as leaves are registered and appended after the children's leaves.
foreach (var token in node.ChildTokens())
{
if (Leaf.IsLeafToken(token))
{
tokens[token] = new Leaf(nodes, token);
Tokens.Add(token);
}
}
Node res = new Node(This: node,
Ancestors: new HashSet(visitedNodes),
Descendents: Desc.ToArray(),
Leaves: Tokens.ToArray(),
Kind: node.Kind());
nodes[node] = res;
}
}
// Returns the syntax node this Tree was constructed from.
internal SyntaxNode GetRoot()
{
return tree;
}
// Root node supplied to the constructor.
SyntaxNode tree;
// Per-node summaries, filled in by TreeBuilderWalker during construction.
internal Dictionary nodes = new Dictionary();
// Leaf descriptions keyed by token, filled in by TreeBuilderWalker during construction.
internal Dictionary leaves = new Dictionary();
/// <summary>
/// Wraps the given syntax node and walks it once, populating <c>nodes</c>
/// and <c>leaves</c> for the whole subtree.
/// </summary>
/// <param name="syntaxTree">Root node of the subtree to index.</param>
public Tree(SyntaxNode syntaxTree)
{
    this.tree = syntaxTree;
    new TreeBuilderWalker(nodes, leaves).Visit(this.tree);
}
}
/// <summary>
/// Summary of one syntax node: the node itself, its ancestors, its descendants,
/// the leaf tokens beneath it and its kind. Equality and hashing are delegated
/// to the wrapped syntax node.
/// </summary>
public class Node
{
    /// <summary>
    /// Stores the given data; AncestorsAndSelf is a copy of
    /// <paramref name="Ancestors"/> with <paramref name="This"/> added.
    /// </summary>
    public Node(SyntaxNode This, HashSet Ancestors, SyntaxNode[] Descendents,
        SyntaxToken[] Leaves, SyntaxKind Kind)
    {
        this.This = This;
        this.Ancestors = Ancestors;
        this.Descendents = Descendents;
        this.AncestorsAndSelf = new HashSet(Ancestors);
        this.AncestorsAndSelf.Add(This);
        this.Leaves = Leaves;
        // Previously `this.Depth = Depth`, a self-assignment that left Depth at 0;
        // the depth of a node is the number of its ancestors.
        this.Depth = Ancestors.Count;
        this.Kind = Kind;
        this.KindName = Kind.ToString();
    }

    public SyntaxNode This { get; }
    public HashSet Ancestors { get; }
    public HashSet AncestorsAndSelf { get; }
    public SyntaxNode[] Descendents { get; }
    public SyntaxToken[] Leaves { get; }
    public SyntaxKind Kind { get; }
    public string KindName { get; }
    // Number of ancestors supplied at construction.
    public int Depth { get; }

    // Two Node instances are equal when they wrap equal syntax nodes.
    public override bool Equals(object obj)
    {
        var item = obj as Node;
        if (item == null)
        {
            return false;
        }
        return this.This.Equals(item.This);
    }

    public override int GetHashCode()
    {
        return this.This.GetHashCode();
    }
}
/// <summary>
/// Description of a leaf token: its kind, text and whether it is treated as constant.
/// </summary>
public class Leaf
{
    /// <summary>
    /// Decides whether a token is a leaf. The implicit-type keyword "var" of a
    /// local declaration and the dummy placeholder names are rejected;
    /// identifiers, literals and tokens of a predefined type are accepted.
    /// </summary>
    internal static bool IsLeafToken(SyntaxToken token)
    {
        if (token.Text.Equals("var") && token.IsKind(SyntaxKind.IdentifierToken)
            && token.Parent.IsKind(SyntaxKind.IdentifierName) && token.Parent.Parent.IsKind(SyntaxKind.VariableDeclaration)
            && token.Parent.Parent.Parent.IsKind(SyntaxKind.LocalDeclarationStatement))
        {
            return false;
        }
        if (token.ValueText == Tree.DummyMethodName || token.ValueText == Tree.DummyType)
        {
            return false;
        }
        return Tree.identifiers.Contains(token.Kind()) || Tree.literals.Contains(token.Kind()) || token.Parent.Kind() == SyntaxKind.PredefinedType;
    }

    public SyntaxToken token { get; }
    public SyntaxKind Kind { get; }
    public string KindName { get; }
    public string Text { get; set; }
    // False only for an identifier token whose parent is a declaration kind.
    public bool IsConst { get; }
    // Initialized to the token's value text.
    public string VariableName { get; }

    /// <summary>
    /// Captures the token's kind and value text.
    /// </summary>
    /// <param name="nodes">Not used by this constructor; kept for the existing callers.</param>
    /// <param name="token">Token to describe.</param>
    public Leaf(Dictionary nodes, SyntaxToken token)
    {
        this.token = token;
        Kind = token.Kind();
        KindName = Kind.ToString();
        IsConst = !(Tree.identifiers.Contains(Kind) && Tree.declarations.Contains(token.Parent.Kind()));
        Text = token.ValueText;
        VariableName = Text;
    }
}
/// <summary>
/// Writes a Graphviz "digraph" description of a syntax tree to the file "out.dot".
/// </summary>
public class SyntaxViewer
{
// Builds the DOT text: one vertex per node and per token, and one edge from each parent to its child.
private string ToDot(SyntaxTree tree)
{
List nodes = tree.GetRoot().DescendantNodesAndSelf().ToList();
SyntaxToken[] tokens = tree.GetRoot().DescendantTokens().ToArray();
string[] tokenStrings = tokens.Select((arg) => arg.Kind().ToString() + "-" + arg.ToString()).ToArray();
string[] nodeStrings = nodes.Select((arg) => arg.Kind().ToString()).ToArray();
Dictionary counts = new Dictionary();
Dictionary nodeNames = new Dictionary();
// Nodes come first, then tokens; index i below refers to a position in this combined sequence.
IEnumerable allItems = nodeStrings.Concat(tokenStrings);
int i = 0;
// Make vertex names unique by appending a running count per distinct label.
foreach (string name in allItems)
{
if (!counts.ContainsKey(name))
counts[name] = 0;
counts[name] += 1;
nodeNames[i] = name + counts[name].ToString();
i++;
}
StringBuilder builder = new StringBuilder();
builder.AppendLine("digraph G {");
// vertexes
for (i = 0; i < allItems.Count(); i++)
{
builder.AppendFormat("\"{0}\" ;\n", nodeNames[i]);
}
builder.AppendLine();
// edges
// Starts at 1: index 0 is the root, which has no parent edge.
// NOTE(review): nodes.IndexOf is a linear search per edge, so this is quadratic in tree size.
for (i = 1; i < nodes.Count(); i++)
{
builder.AppendFormat("\"{0}\"->\"{1}\" [];\n", nodeNames[nodes.IndexOf(nodes[i].Parent)], nodeNames[i]);
}
// Token vertices are offset by the number of nodes in the combined sequence.
for (i = 0; i < tokens.Count(); i++)
{
builder.AppendFormat("\"{0}\"->\"{1}\" [];\n", nodeNames[nodes.IndexOf(tokens[i].Parent)], nodeNames[i + nodes.Count()]);
}
builder.AppendLine("}");
return builder.ToString();
}
// NOTE(review): the `path` parameter is never read; output always goes to "out.dot".
// The default "out.ong" looks like a typo (possibly "out.png") — confirm the intent before changing.
public SyntaxViewer(SyntaxTree tree, string path = "out.ong")
{
string dotData = ToDot(tree);
File.WriteAllText("out.dot", dotData);
}
}
}
================================================
FILE: CSharpExtractor/CSharpExtractor/Extractor/Utilities.cs
================================================
using CommandLine;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Diagnostics;
using System.Text.RegularExpressions;
namespace Extractor
{
/// <summary>
/// Command-line options of the extractor.
/// </summary>
public class Options
{
    [Option('t', "threads", Default = 1, HelpText = "How many threads to use <1>")]
    public int Threads { get; set; }

    [Option('p', "path", Default = "./data/", HelpText = "Where to find code files. <.>")]
    public string Path { get; set; }

    [Option('l', "max_length", Default = 9, HelpText = "Max path length")]
    public int MaxLength { get; set; }

    // NOTE(review): the short name 'l' is shared by max_length, max_width and
    // max_contexts; left unchanged to keep the existing command line, but only
    // the long names identify these options unambiguously.
    [Option('l', "max_width", Default = 2, HelpText = "Max path width")]
    public int MaxWidth { get; set; }

    [Option('o', "ofile_name", Default = "test.txt", HelpText = "Output file name")]
    public String OFileName { get; set; }

    // NOTE(review): the default is true, so paths are emitted unhashed even
    // when the flag is not passed — confirm that this is intended.
    [Option('h', "no_hash", Default = true, HelpText = "When enabled, prints the whole path strings (not hashed)")]
    public Boolean NoHash { get; set; }

    [Option('l', "max_contexts", Default = 30000, HelpText = "Max number of path contexts to sample. Affects only very large snippets")]
    public int MaxContexts { get; set; }
}
public static class Utilities
{
// Numeric literals that NormalizeName returns as-is; every other all-digit name becomes "NUM".
public static String[] NumbericLiteralsToKeep = new String[] { "0", "1", "2", "3", "4", "5", "10" };
// Lazily yields every pair (e, t) in which t comes after e in the sequence,
// i.e. all unordered pairs of elements at distinct positions.
// The input is enumerated repeatedly (once per element via Skip).
public static IEnumerable> Choose2(IEnumerable enumerable)
{
int index = 0;
foreach (var e in enumerable)
{
++index;
// Skip(index) starts just past the current element.
foreach (var t in enumerable.Skip(index))
yield return Tuple.Create(e, t);
}
}
/// <summary>
/// Samples up to numSamples elements uniformly at random from an enumerable, using reservoir sampling.
/// See https://en.wikipedia.org/wiki/Reservoir_sampling
/// </summary>
/// <param name="input">Sequence to sample from; it is enumerated exactly once.</param>
/// <param name="numSamples">Maximum number of elements to return.</param>
/// <returns>All elements when the input has at most numSamples of them, otherwise numSamples sampled elements.</returns>
public static IEnumerable ReservoirSample(this IEnumerable input, int numSamples)
{
var rng = new Random();
var sampledElements = new List(numSamples);
int seenElementCount = 0;
foreach (var element in input)
{
seenElementCount++;
// Fill the reservoir with the first numSamples elements.
if (sampledElements.Count < numSamples)
{
sampledElements.Add(element);
}
else
{
// Draw a slot in [0, seenElementCount); the element replaces a sample only when the slot falls inside the reservoir.
int position = rng.Next(seenElementCount);
if (position < numSamples)
{
sampledElements[position] = element;
}
}
}
Debug.Assert(sampledElements.Count <= numSamples);
return sampledElements;
}
public static IEnumerable WeakConcat(IEnumerable enumerable1, IEnumerable enumerable2)
{
foreach (T t in enumerable1)
yield return t;
foreach (T t in enumerable2)
yield return t;
}
public static IEnumerable SplitToSubtokens(String name)
{
return Regex.Split(name.Trim(), "(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+")
.Where(s => s.Length > 0)
.Select(s => NormalizeName(s))
.Where(s => s.Length > 0);
}
private static Regex Whitespaces = new Regex(@"\s");
private static Regex NonAlphabetic = new Regex("[^A-Za-z]");
public static String NormalizeName(string s)
{
String partiallyNormalized = s.ToLowerInvariant()
.Replace("\\\\n", String.Empty)
.Replace("[\"',]", String.Empty);
partiallyNormalized = Whitespaces.Replace(partiallyNormalized, "");
partiallyNormalized = Encoding.ASCII.GetString(
Encoding.Convert(
Encoding.UTF8,
Encoding.GetEncoding(
Encoding.ASCII.EncodingName,
new EncoderReplacementFallback(string.Empty),
new DecoderExceptionFallback()
),
Encoding.UTF8.GetBytes(partiallyNormalized)
)
);
if (partiallyNormalized.Contains('\n'))
{
partiallyNormalized = partiallyNormalized.Replace('\n', 'N');
}
if (partiallyNormalized.Contains('\r'))
{
partiallyNormalized = partiallyNormalized.Replace('\r', 'R');
}
if (partiallyNormalized.Contains(','))
{
partiallyNormalized = partiallyNormalized.Replace(',', 'C');
}
String completelyNormalized = NonAlphabetic.Replace(partiallyNormalized, String.Empty);
if (completelyNormalized.Length == 0)
{
if (Regex.IsMatch(partiallyNormalized, @"^\d+$"))
{
if (NumbericLiteralsToKeep.Contains(partiallyNormalized))
{
return partiallyNormalized;
}
else
{
return "NUM";
}
}
return String.Empty;
}
return completelyNormalized;
}
}
}
================================================
FILE: CSharpExtractor/CSharpExtractor/Extractor/Variable.cs
================================================
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.CodeAnalysis;
using Microsoft.CodeAnalysis.CSharp.Syntax;
namespace Extractor
{
namespace Semantics
{
/// <summary>
/// A named group of leaf tokens of a method tree. All leaves whose variable name is the
/// same belong to one <see cref="Variable"/>; the method's own identifier is grouped under
/// <c>Extractor.MethodNameConst</c>.
/// </summary>
public class Variable
{
    Tree tree;

    /// <summary>Name shared by all leaves of this variable.</summary>
    public string Name { get; }

    private HashSet<SyntaxToken> leaves;

    /// <summary>Leaf tokens grouped under <see cref="Name"/>.</summary>
    public HashSet<SyntaxToken> Leaves
    {
        get
        {
            return leaves;
        }
    }

    private Nullable<bool> constant;

    /// <summary>True when every leaf of this variable is marked constant in the tree.</summary>
    public bool Const
    {
        get
        {
            return constant.Value;
        }
    }

    private Variable(string name, SyntaxToken[] leaves, Tree tree)
    {
        this.tree = tree;
        this.Name = name;
        this.leaves = new HashSet<SyntaxToken>(leaves);
        constant = true;
        foreach (var leaf in leaves)
        {
            if (!tree.leaves[leaf].IsConst)
            {
                constant = false;
                // If not constant then it is a declaration token
                break;
            }
        }
    }

    /// <summary>Hash code derived from <see cref="Name"/> only.</summary>
    public override int GetHashCode()
    {
        return this.Name.GetHashCode();
    }

    /// <summary>Whether the kind of the first leaf is one of <c>Tree.literals</c>.</summary>
    public bool IsLiteral()
    {
        return Tree.literals.Contains(tree.leaves[Leaves.First()].Kind);
    }

    /// <summary>Whether <paramref name="token"/> is the identifier of a method declaration.</summary>
    internal static Boolean isMethodName(SyntaxToken token)
    {
        return token.Parent.IsKind(Microsoft.CodeAnalysis.CSharp.SyntaxKind.MethodDeclaration)
            && token.IsKind(Microsoft.CodeAnalysis.CSharp.SyntaxKind.IdentifierToken);
    }

    /// <summary>
    /// Groups the leaves under the root of <paramref name="methodTree"/> by name and returns
    /// one <see cref="Variable"/> per distinct name, ordered by first occurrence.
    /// </summary>
    /// <param name="methodTree">Tree of a single method.</param>
    /// <returns>The variables of the method.</returns>
    internal static IEnumerable<Variable> CreateFromMethod(Tree methodTree)
    {
        var root = methodTree.nodes[methodTree.GetRoot()];
        var leaves = root.Leaves.ToArray();

        var nameToTokens = new Dictionary<string, List<SyntaxToken>>();
        // Distinct names in the order they are first seen; this is the result order.
        var namesInOrder = new List<string>();
        foreach (SyntaxToken token in leaves)
        {
            string name = methodTree.leaves[token].VariableName;
            if (isMethodName(token))
            {
                name = Extractor.MethodNameConst;
            }

            List<SyntaxToken> tokensOfName;
            if (!nameToTokens.TryGetValue(name, out tokensOfName))
            {
                tokensOfName = new List<SyntaxToken>();
                nameToTokens[name] = tokensOfName;
                namesInOrder.Add(name);
            }
            tokensOfName.Add(token);
        }

        // One Variable per name. The previous version constructed a Variable for every
        // leaf and linearly searched the result list to discard duplicates.
        List<Variable> results = new List<Variable>(namesInOrder.Count);
        foreach (string name in namesInOrder)
        {
            results.Add(new Variable(name, nameToTokens[name].ToArray(), methodTree));
        }
        return results;
    }
}
}
}
================================================
FILE: CSharpExtractor/extract.py
================================================
#!/usr/bin/python
import itertools
import multiprocessing
import os
import sys
import shutil
import subprocess
from threading import Timer
import sys
from argparse import ArgumentParser
from subprocess import Popen, PIPE, STDOUT, call
def get_immediate_subdirectories(a_dir):
    """Return the paths of the directories located directly inside ``a_dir``.

    Every returned path is ``a_dir`` joined with the entry name; entries appear in
    the order produced by ``os.listdir``. Regular files are left out.
    """
    subdirectories = []
    for entry in os.listdir(a_dir):
        candidate = os.path.join(a_dir, entry)
        if os.path.isdir(candidate):
            subdirectories.append(candidate)
    return subdirectories
# Scratch directory of the current run; assigned by ExtractFeaturesForDirsList.
TMP_DIR = ""


def ParallelExtractDir(args, dir):
    """Worker entry point: extract features for ``dir`` starting with an empty prefix."""
    ExtractFeaturesForDir(args, dir, prefix="")
def ExtractFeaturesForDir(args, dir, prefix):
    """Run the C# extractor on ``dir``; on failure retry each immediate subdirectory.

    The extractor is started with ``dotnet run`` using the project, path limits, thread
    count and output file name taken from ``args``. When the process exits with a
    non-zero status (including being killed by the watchdog timer) the function recurses
    into the immediate subdirectories of ``dir`` and finally removes the output file.

    :param args: parsed command-line arguments (csproj, max_path_length, max_path_width,
        num_threads, ofile_name).
    :param dir: directory handed to the extractor via ``--path``.
    :param prefix: accumulated name prefix, extended with the last component of ``dir``
        for every recursion level.
    """
    command = ['dotnet', 'run', '--project', args.csproj,
               '--max_length', str(args.max_path_length), '--max_width', str(args.max_path_width),
               '--path', dir, '--threads', str(args.num_threads), '--ofile_name', str(args.ofile_name)]

    # Must be initialized: it used to be assigned only on the failure path, so every
    # successful run ended in UnboundLocalError at the check below.
    failed = False

    sleeper = subprocess.Popen(command, stderr=subprocess.PIPE)
    # Watchdog that kills the extractor if it runs longer than the timeout (in seconds).
    timer = Timer(600000, sleeper.kill)
    try:
        timer.start()
        _, stderr = sleeper.communicate()
    finally:
        timer.cancel()

    if sleeper.poll() == 0:
        if len(stderr) > 0:
            # print(sys.stderr, x) is a Python 2 leftover that writes the repr of the
            # stream object to stdout; route the message to stderr instead.
            print(stderr.decode(errors='replace'), file=sys.stderr)
    else:
        print('dir: ' + str(dir) + ' was not completed in time', file=sys.stderr)
        failed = True
        for subdir in get_immediate_subdirectories(dir):
            ExtractFeaturesForDir(args, subdir, prefix + dir.split('/')[-1] + '_')

    if failed:
        if os.path.exists(str(args.ofile_name)):
            os.remove(str(args.ofile_name))
def ExtractFeaturesForDirsList(args, dirs):
    """Extract features for every directory in ``dirs`` using a pool of 4 processes.

    A per-process scratch directory is (re)created under ``./tmp`` and published through
    the module-level ``TMP_DIR``. After all directories were processed, every file found
    in the scratch directory is written to stdout with ``cat``; the scratch directory is
    removed in all cases.

    :param args: parsed command-line arguments forwarded to each worker.
    :param dirs: iterable of directory paths to process.
    """
    global TMP_DIR
    TMP_DIR = "./tmp/feature_extractor%d/" % (os.getpid())
    if os.path.exists(TMP_DIR):
        shutil.rmtree(TMP_DIR, ignore_errors=True)
    os.makedirs(TMP_DIR)
    try:
        # The context manager tears the pool down once starmap has returned; the pool
        # was previously never closed, leaving its worker processes behind.
        with multiprocessing.Pool(4) as pool:
            pool.starmap(ParallelExtractDir, zip(itertools.repeat(args), dirs))
        output_files = os.listdir(TMP_DIR)
        for f in output_files:
            os.system("cat %s/%s" % (TMP_DIR, f))
    finally:
        shutil.rmtree(TMP_DIR, ignore_errors=True)
if __name__ == '__main__':
    # Command-line entry point: parse the options, pick the directories to process
    # and hand them to the pool-based driver.
    arg_parser = ArgumentParser()
    arg_parser.add_argument("-maxlen", "--max_path_length", dest="max_path_length", required=False, default=8)
    arg_parser.add_argument("-maxwidth", "--max_path_width", dest="max_path_width", required=False, default=2)
    arg_parser.add_argument("-threads", "--num_threads", dest="num_threads", required=False, default=64)
    arg_parser.add_argument("--csproj", dest="csproj", required=True)
    arg_parser.add_argument("-dir", "--dir", dest="dir", required=False)
    arg_parser.add_argument("-ofile_name", "--ofile_name", dest="ofile_name", required=True)
    args = arg_parser.parse_args()

    if args.dir is not None:
        # Process each immediate subdirectory; a directory without subdirectories
        # is processed itself (trailing slashes removed).
        to_extract = get_immediate_subdirectories(args.dir)
        if len(to_extract) == 0:
            to_extract = [args.dir.rstrip('/')]
        ExtractFeaturesForDirsList(args, to_extract)
================================================
FILE: Input.java
================================================
// Returns the value of `name`.
// NOTE(review): this bare method is presumably the sample input fed to the extractor,
// so its tokens are left untouched - confirm before restructuring.
public String getName() {
return name;
}
================================================
FILE: JavaExtractor/JPredict/.classpath
================================================
================================================
FILE: JavaExtractor/JPredict/.gitignore
================================================
/target/
================================================
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/App.java
================================================
package JavaExtractor;
import JavaExtractor.Common.CommandLineValues;
import org.kohsuke.args4j.CmdLineException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ThreadPoolExecutor;
/**
 * Command-line entry point of the Java extractor. Processes either a single file
 * ({@code --file}) or every {@code .java} file below a directory ({@code --dir}).
 */
public class App {
    private static CommandLineValues s_CommandLineValues;

    /**
     * Parses the arguments and dispatches to single-file or directory extraction.
     * Returns without doing anything else when the arguments cannot be parsed.
     *
     * @param args raw command-line arguments
     */
    public static void main(String[] args) {
        try {
            s_CommandLineValues = new CommandLineValues(args);
        } catch (CmdLineException e) {
            e.printStackTrace();
            return;
        }

        if (s_CommandLineValues.File != null) {
            ExtractFeaturesTask extractFeaturesTask = new ExtractFeaturesTask(s_CommandLineValues,
                    s_CommandLineValues.File.toPath());
            extractFeaturesTask.processFile();
        } else if (s_CommandLineValues.Dir != null) {
            extractDir();
        }
    }

    /**
     * Creates one task per regular {@code .java} file found below the configured
     * directory and runs all tasks on a fixed pool of {@code NumThreads} threads.
     */
    private static void extractDir() {
        LinkedList<ExtractFeaturesTask> tasks = new LinkedList<>();
        // Files.walk keeps directory handles open until the stream is closed.
        try (java.util.stream.Stream<java.nio.file.Path> paths =
                     Files.walk(Paths.get(s_CommandLineValues.Dir))) {
            paths.filter(Files::isRegularFile)
                    .filter(p -> p.toString().toLowerCase().endsWith(".java"))
                    .forEach(f -> tasks.add(new ExtractFeaturesTask(s_CommandLineValues, f)));
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }

        // The pool is created only after the file scan succeeded, so the early return
        // above no longer leaves an executor that was never shut down.
        ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(s_CommandLineValues.NumThreads);
        List<Future<Void>> tasksResults;
        try {
            tasksResults = executor.invokeAll(tasks);
        } catch (InterruptedException e) {
            e.printStackTrace();
            // Restore the interrupt flag and stop: there are no results to inspect
            // (the old code went on and dereferenced a null list here).
            Thread.currentThread().interrupt();
            return;
        } finally {
            executor.shutdown();
        }

        // Surface exceptions thrown inside the tasks.
        tasksResults.forEach(f -> {
            try {
                f.get();
            } catch (InterruptedException | ExecutionException e) {
                e.printStackTrace();
            }
        });
    }
}
================================================
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/CommandLineValues.java
================================================
package JavaExtractor.Common;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
import java.io.File;
/**
 * Holds the program's command-line arguments. The {@code @Option}-annotated fields
 * are populated by the args4j {@code CmdLineParser} in the constructor.
 */
public class CommandLineValues {
// Single source file to process.
@Option(name = "--file", required = false)
public File File = null;
// Directory to scan for .java files; declared as mutually exclusive with --file.
@Option(name = "--dir", required = false, forbids = "--file")
public String Dir = null;
// Paths longer than this are discarded by the feature extractor.
@Option(name = "--max_path_length", required = true)
public int MaxPathLength;
// Paths wider than this (child-id distance below the common ancestor) are discarded.
@Option(name = "--max_path_width", required = true)
public int MaxPathWidth;
// Size of the thread pool used for directory extraction.
@Option(name = "--num_threads", required = false)
public int NumThreads = 64;
// NOTE(review): presumably bounds on method body length applied by the visitor - confirm there.
@Option(name = "--min_code_len", required = false)
public int MinCodeLength = 1;
@Option(name = "--max_code_len", required = false)
public int MaxCodeLength = -1;
// Files with more lines than this are skipped; only checked when the value is positive.
@Option(name = "--max_file_len", required = false)
public int MaxFileLength = -1;
// When set, spaces in the printed output are replaced by a newline and a tab.
@Option(name = "--pretty_print", required = false)
public boolean PrettyPrint = false;
// Child ids written into paths are capped at this value.
@Option(name = "--max_child_id", required = false)
public int MaxChildId = 3;
// When set, each method's features are printed as JSON instead of the plain text form.
@Option(name = "--json_output", required = false)
public boolean JsonOutput = false;
/**
 * Parses {@code args} into the fields of this object.
 *
 * @param args raw command-line arguments
 * @throws CmdLineException when parsing fails; the message and the usage text
 *                          are printed to stderr before the exception is rethrown
 */
public CommandLineValues(String... args) throws CmdLineException {
CmdLineParser parser = new CmdLineParser(this);
try {
parser.parseArgument(args);
} catch (CmdLineException e) {
System.err.println(e.getMessage());
parser.printUsage(System.err);
throw e;
}
}
/** Creates an instance that carries only the default values. */
public CommandLineValues() {
}
}
================================================
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java
================================================
package JavaExtractor.Common;
import JavaExtractor.FeaturesEntities.Property;
import com.github.javaparser.ast.Node;
import com.github.javaparser.ast.UserDataKey;
import java.util.ArrayList;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
 * Constants and string helpers shared by the extractor: user-data keys attached to
 * AST nodes, well-known type names and label normalization.
 */
public final class Common {
    /** Key under which a node's {@link Property} is stored. */
    public static final UserDataKey<Property> PropertyKey = new UserDataKey<Property>() {
    };
    /** Key under which a node's child index is stored. */
    public static final UserDataKey<Integer> ChildId = new UserDataKey<Integer>() {
    };
    public static final String EmptyString = "";

    public static final String MethodDeclaration = "MethodDeclaration";
    public static final String NameExpr = "NameExpr";
    public static final String BlankWord = "BLANK";

    public static final int c_MaxLabelLength = 50;
    public static final String methodName = "METHOD_NAME";
    public static final String internalSeparator = "|";

    /**
     * Normalizes a label: lower-cases it, removes escaped new lines, quotes, apostrophes,
     * commas and non-printable characters, then keeps only the letters. When no letter
     * remains, the cleaned text with spaces turned into underscores is returned instead.
     *
     * @param original      raw label text
     * @param defaultString value returned when nothing remains after cleaning
     * @return the normalized label, or {@code defaultString}
     */
    public static String normalizeName(String original, String defaultString) {
        // Locale.ROOT keeps the result independent of the JVM default locale (e.g. the
        // Turkish dotless i, which the letter filter below would otherwise drop).
        original = original.toLowerCase(java.util.Locale.ROOT)
                .replaceAll("\\\\n", "") // escaped new lines
                // NOTE(review): "//s+" matches two literal slashes followed by 's', not
                // whitespace ("\\s+" was probably intended). Kept as is because changing
                // it alters the labels emitted for tokens without letters.
                .replaceAll("//s+", "")
                .replaceAll("[\"',]", "") // quotes, apostrophes, commas
                .replaceAll("\\P{Print}", ""); // non-printable characters
        String stripped = original.replaceAll("[^A-Za-z]", "");
        if (stripped.length() > 0) {
            return stripped;
        }
        String carefulStripped = original.replaceAll(" ", "_");
        return carefulStripped.length() == 0 ? defaultString : carefulStripped;
    }

    /**
     * Tells whether {@code node} is the name of a method declaration, i.e. {@code type}
     * is {@code NameExpr} and the parent's property type is {@code MethodDeclaration}.
     *
     * @param node node to test
     * @param type property type of {@code node}
     * @return {@code false} when the node has no parent or the parent has no property
     */
    public static boolean isMethod(Node node, String type) {
        Node parent = node.getParentNode();
        if (parent == null) {
            // A root node cannot be a method name; previously this dereferenced null.
            return false;
        }
        Property parentProperty = parent.getUserData(Common.PropertyKey);
        if (parentProperty == null) {
            return false;
        }

        String parentType = parentProperty.getType();
        return Common.NameExpr.equals(type) && Common.MethodDeclaration.equals(parentType);
    }

    /**
     * Splits a name on camelCase boundaries, underscores, digits, whitespace and the
     * internal separator, normalizes every part and drops the empty ones.
     *
     * @param str1 name to split
     * @return the normalized, non-empty sub-tokens in order of appearance
     */
    public static ArrayList<String> splitToSubtokens(String str1) {
        String str2 = str1.replace("|", " ");
        String str3 = str2.trim();
        return Stream.of(str3.split("(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+"))
                .filter(s -> s.length() > 0).map(s -> Common.normalizeName(s, Common.EmptyString))
                .filter(s -> s.length() > 0).collect(Collectors.toCollection(ArrayList::new));
    }
}
================================================
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/MethodContent.java
================================================
package JavaExtractor.Common;
import com.github.javaparser.ast.Node;
import java.util.ArrayList;
public class MethodContent {
private final ArrayList leaves;
private final String name;
private final String content;
public MethodContent(ArrayList leaves, String name, String content) {
this.leaves = leaves;
this.name = name;
this.content = content;
}
public ArrayList getLeaves() {
return leaves;
}
public String getName() {
return name;
}
public String getContent() {
return content;
}
}
================================================
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/ExtractFeaturesTask.java
================================================
package JavaExtractor;
import JavaExtractor.Common.CommandLineValues;
import JavaExtractor.Common.Common;
import JavaExtractor.FeaturesEntities.ProgramFeatures;
import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import com.google.gson.Gson;
/**
 * Extracts the path features of one source file and prints them to stdout.
 * Usable directly through {@link #processFile()} or as a {@link Callable} on an executor.
 */
class ExtractFeaturesTask implements Callable<Void> {
    private final CommandLineValues commandLineValues;
    private final Path filePath;

    /**
     * @param commandLineValues extraction settings
     * @param path              file to process
     */
    public ExtractFeaturesTask(CommandLineValues commandLineValues, Path path) {
        this.commandLineValues = commandLineValues;
        this.filePath = path;
    }

    /** Runs {@link #processFile()}; always returns {@code null}. */
    @Override
    public Void call() {
        processFile();
        return null;
    }

    /**
     * Extracts the features of the file and prints them when the result is non-empty.
     * An {@link IOException} is reported on stderr and ends the processing of this file.
     */
    public void processFile() {
        ArrayList<ProgramFeatures> features;
        try {
            features = extractSingleFile();
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }
        if (features == null) {
            return;
        }

        String toPrint = featuresToString(features);
        if (toPrint.length() > 0) {
            System.out.println(toPrint);
        }
    }

    /**
     * Reads the file and runs the feature extractor on its content.
     *
     * @return the features of all methods; an empty list when the file has more lines
     *         than {@code MaxFileLength} (only checked when that limit is positive)
     * @throws IOException when the line count cannot be determined
     */
    private ArrayList<ProgramFeatures> extractSingleFile() throws IOException {
        if (commandLineValues.MaxFileLength > 0) {
            // Files.lines keeps the file open until the stream is closed; it used to
            // be left to the garbage collector.
            try (java.util.stream.Stream<String> lines = Files.lines(filePath, Charset.defaultCharset())) {
                if (lines.count() > commandLineValues.MaxFileLength) {
                    return new ArrayList<>();
                }
            }
        }

        String code;
        try {
            code = new String(Files.readAllBytes(filePath));
        } catch (IOException e) {
            // Best effort: an unreadable file is treated as empty source.
            e.printStackTrace();
            code = Common.EmptyString;
        }
        FeatureExtractor featureExtractor = new FeatureExtractor(commandLineValues, this.filePath);

        return featureExtractor.extractFeatures(code);
    }

    /**
     * Renders one line per method, either as JSON ({@code JsonOutput}) or through
     * {@code ProgramFeatures.toString()}. With {@code PrettyPrint}, spaces are replaced
     * by a newline followed by a tab.
     *
     * @param features features of the methods of one file; may be {@code null}
     * @return the joined lines, or the empty string when there is nothing to print
     */
    public String featuresToString(ArrayList<ProgramFeatures> features) {
        if (features == null || features.isEmpty()) {
            return Common.EmptyString;
        }

        // One serializer for the whole file instead of one per method.
        Gson gson = commandLineValues.JsonOutput ? new Gson() : null;
        List<String> methodsOutputs = new ArrayList<>();
        for (ProgramFeatures singleMethodFeatures : features) {
            String toPrint = gson != null
                    ? gson.toJson(singleMethodFeatures)
                    : singleMethodFeatures.toString();
            if (commandLineValues.PrettyPrint) {
                toPrint = toPrint.replace(" ", "\n\t");
            }
            methodsOutputs.add(toPrint);
        }
        return StringUtils.join(methodsOutputs, "\n");
    }
}
================================================
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/FeatureExtractor.java
================================================
package JavaExtractor;
import JavaExtractor.Common.CommandLineValues;
import JavaExtractor.Common.Common;
import JavaExtractor.Common.MethodContent;
import JavaExtractor.FeaturesEntities.ProgramFeatures;
import JavaExtractor.FeaturesEntities.Property;
import JavaExtractor.Visitors.FunctionVisitor;
import com.github.javaparser.JavaParser;
import com.github.javaparser.ParseProblemException;
import com.github.javaparser.ast.CompilationUnit;
import com.github.javaparser.ast.Node;
import java.io.File;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Set;
import java.util.StringJoiner;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
 * Turns source code into per-method path features: for every pair of leaves of a
 * method, the AST path connecting them is rendered as a string and recorded together
 * with the properties of both leaves.
 */
class FeatureExtractor {
    private final static String upSymbol = "|";
    private final static String downSymbol = "|";
    /** Raw parent types whose children get their child id appended in a path. */
    private static final Set<String> s_ParentTypeToAddChildId = Stream
            .of("AssignExpr", "ArrayAccessExpr", "FieldAccessExpr", "MethodCallExpr")
            .collect(Collectors.toCollection(HashSet::new));
    private final CommandLineValues m_CommandLineValues;
    private final Path filePath;

    /**
     * @param commandLineValues extraction settings (path length/width limits, max child id)
     * @param filePath          path of the file being processed; stored in the produced features
     */
    public FeatureExtractor(CommandLineValues commandLineValues, Path filePath) {
        this.m_CommandLineValues = commandLineValues;
        this.filePath = filePath;
    }

    /** Returns {@code node} followed by all of its ancestors, ending with the root. */
    private static ArrayList<Node> getTreeStack(Node node) {
        ArrayList<Node> upStack = new ArrayList<>();
        Node current = node;
        while (current != null) {
            upStack.add(current);
            current = current.getParentNode();
        }
        return upStack;
    }

    /**
     * Parses {@code code}, collects its methods and generates the features of each.
     *
     * @param code source text; may be a whole compilation unit, class members or statements
     * @return features of all methods that produced at least one path
     */
    public ArrayList<ProgramFeatures> extractFeatures(String code) {
        CompilationUnit m_CompilationUnit = parseFileWithRetries(code);
        FunctionVisitor functionVisitor = new FunctionVisitor(m_CommandLineValues);

        functionVisitor.visit(m_CompilationUnit, null);

        ArrayList<MethodContent> methods = functionVisitor.getMethodContents();

        return generatePathFeatures(methods);
    }

    /**
     * Parses the code as is; on failure retries with the code wrapped in a dummy class and
     * method, and finally wrapped in a dummy class only. A failure of the last attempt
     * propagates as {@link ParseProblemException}.
     */
    private CompilationUnit parseFileWithRetries(String code) {
        final String classPrefix = "public class Test {";
        final String classSuffix = "}";
        final String methodPrefix = "SomeUnknownReturnType f() {";
        final String methodSuffix = "return noSuchReturnValue; }";

        String content = code;
        CompilationUnit parsed;
        try {
            parsed = JavaParser.parse(content);
        } catch (ParseProblemException e1) {
            // Wrap with a class and method
            try {
                content = classPrefix + methodPrefix + code + methodSuffix + classSuffix;
                parsed = JavaParser.parse(content);
            } catch (ParseProblemException e2) {
                // Wrap with a class only
                content = classPrefix + code + classSuffix;
                parsed = JavaParser.parse(content);
            }
        }

        return parsed;
    }

    /** Generates the features of every method and keeps the non-empty ones. */
    private ArrayList<ProgramFeatures> generatePathFeatures(ArrayList<MethodContent> methods) {
        ArrayList<ProgramFeatures> methodsFeatures = new ArrayList<>();
        for (MethodContent content : methods) {
            ProgramFeatures singleMethodFeatures = generatePathFeaturesForFunction(content);
            if (!singleMethodFeatures.isEmpty()) {
                methodsFeatures.add(singleMethodFeatures);
            }
        }
        return methodsFeatures;
    }

    /** Records one feature for every ordered leaf pair (i &lt; j) whose path was not rejected. */
    private ProgramFeatures generatePathFeaturesForFunction(MethodContent methodContent) {
        ArrayList<Node> functionLeaves = methodContent.getLeaves();
        ProgramFeatures programFeatures = new ProgramFeatures(
                methodContent.getName(), this.filePath, methodContent.getContent());

        for (int i = 0; i < functionLeaves.size(); i++) {
            for (int j = i + 1; j < functionLeaves.size(); j++) {
                String separator = Common.EmptyString;
                String path = generatePath(functionLeaves.get(i), functionLeaves.get(j), separator);
                // generatePath signals a rejected pair with the empty string; compare by
                // content rather than by reference identity.
                if (!path.isEmpty()) {
                    Property source = functionLeaves.get(i).getUserData(Common.PropertyKey);
                    Property target = functionLeaves.get(j).getUserData(Common.PropertyKey);
                    programFeatures.addFeature(source, path, target);
                }
            }
        }
        return programFeatures;
    }

    /**
     * Renders the path from {@code source} up to the lowest common ancestor and down to
     * {@code target}.
     *
     * @param source    first leaf
     * @param target    second leaf
     * @param separator text placed between the rendered path elements
     * @return the path string, or {@code Common.EmptyString} when the path is longer than
     *         {@code MaxPathLength} or wider than {@code MaxPathWidth}
     */
    private String generatePath(Node source, Node target, String separator) {

        StringJoiner stringBuilder = new StringJoiner(separator);
        ArrayList<Node> sourceStack = getTreeStack(source);
        ArrayList<Node> targetStack = getTreeStack(target);

        // Walk both ancestor chains from the root while they refer to the same node.
        int commonPrefix = 0;
        int currentSourceAncestorIndex = sourceStack.size() - 1;
        int currentTargetAncestorIndex = targetStack.size() - 1;
        while (currentSourceAncestorIndex >= 0 && currentTargetAncestorIndex >= 0
                && sourceStack.get(currentSourceAncestorIndex) == targetStack.get(currentTargetAncestorIndex)) {
            commonPrefix++;
            currentSourceAncestorIndex--;
            currentTargetAncestorIndex--;
        }

        int pathLength = sourceStack.size() + targetStack.size() - 2 * commonPrefix;
        if (pathLength > m_CommandLineValues.MaxPathLength) {
            return Common.EmptyString;
        }

        // Width: distance between the child ids of the two branches below the common ancestor.
        if (currentSourceAncestorIndex >= 0 && currentTargetAncestorIndex >= 0) {
            int pathWidth = targetStack.get(currentTargetAncestorIndex).getUserData(Common.ChildId)
                    - sourceStack.get(currentSourceAncestorIndex).getUserData(Common.ChildId);
            if (pathWidth > m_CommandLineValues.MaxPathWidth) {
                return Common.EmptyString;
            }
        }

        // Upward part: from the source leaf to just below the common ancestor.
        for (int i = 0; i < sourceStack.size() - commonPrefix; i++) {
            Node currentNode = sourceStack.get(i);
            String childId = Common.EmptyString;
            String parentRawType = currentNode.getParentNode().getUserData(Common.PropertyKey).getRawType();
            if (i == 0 || s_ParentTypeToAddChildId.contains(parentRawType)) {
                childId = saturateChildId(currentNode.getUserData(Common.ChildId))
                        .toString();
            }
            stringBuilder.add(String.format("%s%s%s",
                    currentNode.getUserData(Common.PropertyKey).getType(true), childId, upSymbol));
        }

        // The common ancestor itself.
        Node commonNode = sourceStack.get(sourceStack.size() - commonPrefix);
        String commonNodeChildId = Common.EmptyString;
        Property parentNodeProperty = commonNode.getParentNode().getUserData(Common.PropertyKey);
        String commonNodeParentRawType = Common.EmptyString;
        if (parentNodeProperty != null) {
            commonNodeParentRawType = parentNodeProperty.getRawType();
        }
        if (s_ParentTypeToAddChildId.contains(commonNodeParentRawType)) {
            commonNodeChildId = saturateChildId(commonNode.getUserData(Common.ChildId))
                    .toString();
        }
        stringBuilder.add(String.format("%s%s",
                commonNode.getUserData(Common.PropertyKey).getType(true), commonNodeChildId));

        // Downward part: from just below the common ancestor to the target leaf.
        // NOTE(review): unlike the upward loop, this tests the node's own raw type rather
        // than its parent's; left untouched because it determines the emitted paths.
        for (int i = targetStack.size() - commonPrefix - 1; i >= 0; i--) {
            Node currentNode = targetStack.get(i);
            String childId = Common.EmptyString;
            if (i == 0 || s_ParentTypeToAddChildId.contains(currentNode.getUserData(Common.PropertyKey).getRawType())) {
                childId = saturateChildId(currentNode.getUserData(Common.ChildId))
                        .toString();
            }
            stringBuilder.add(String.format("%s%s%s", downSymbol,
                    currentNode.getUserData(Common.PropertyKey).getType(true), childId));
        }

        return stringBuilder.toString();
    }

    /** Caps a child id at {@code MaxChildId}. */
    private Integer saturateChildId(int childId) {
        return Math.min(childId, m_CommandLineValues.MaxChildId);
    }
}
================================================
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramFeatures.java
================================================
package JavaExtractor.FeaturesEntities;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.stream.Collectors;
/**
 * Features of a single method: its name, source text, file path and the list of
 * path relations between its leaves. The relation list is {@code transient}.
 */
public class ProgramFeatures {
    String name;

    // Typed list: the stream pipeline in toString() needs the element type.
    transient ArrayList<ProgramRelation> features = new ArrayList<>();
    String textContent;

    String filePath;

    /**
     * @param name        method name
     * @param filePath    file the method comes from; stored as an absolute path string
     * @param textContent text of the method
     */
    public ProgramFeatures(String name, Path filePath, String textContent) {
        this.name = name;
        this.filePath = filePath.toAbsolutePath().toString();
        this.textContent = textContent;
    }

    /** Returns the name followed by all relations, everything separated by single spaces. */
    @SuppressWarnings("StringBufferReplaceableByString")
    @Override
    public String toString() {
        StringBuilder stringBuilder = new StringBuilder();
        stringBuilder.append(name).append(" ");
        stringBuilder.append(features.stream().map(ProgramRelation::toString).collect(Collectors.joining(" ")));

        return stringBuilder.toString();
    }

    /** Appends the relation {@code (source, path, target)}. */
    public void addFeature(Property source, String path, Property target) {
        ProgramRelation newRelation = new ProgramRelation(source, target, path);
        features.add(newRelation);
    }

    /** @return {@code true} when no relation has been added */
    public boolean isEmpty() {
        return features.isEmpty();
    }
}
================================================
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramRelation.java
================================================
package JavaExtractor.FeaturesEntities;
/**
 * One path context: the properties of two leaves and the path string between them.
 */
public class ProgramRelation {
    Property source;
    Property target;
    String path;

    /**
     * @param sourceName property of the leaf the path starts at
     * @param targetName property of the leaf the path ends at
     * @param path       rendered path between both leaves
     */
    public ProgramRelation(Property sourceName, Property targetName, String path) {
        this.path = path;
        this.source = sourceName;
        this.target = targetName;
    }

    /** Renders the relation as source name, path and target name separated by commas. */
    @Override
    public String toString() {
        final String sourceLabel = source.getName();
        final String targetLabel = target.getName();
        return String.format("%s,%s,%s", sourceLabel, path, targetLabel);
    }
}
================================================
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/Property.java
================================================
package JavaExtractor.FeaturesEntities;
import JavaExtractor.Common.Common;
import com.github.javaparser.ast.Node;
import com.github.javaparser.ast.expr.AssignExpr;
import com.github.javaparser.ast.expr.BinaryExpr;
import com.github.javaparser.ast.expr.IntegerLiteralExpr;
import com.github.javaparser.ast.expr.UnaryExpr;
import com.github.javaparser.ast.type.ClassOrInterfaceType;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
 * Label information computed for one AST node: its raw class name, its type (possibly
 * extended with an operator or replaced for boxed/generic types) and its name split into
 * sub-tokens.
 */
public class Property {
    /** Integer literals whose text is kept as the name instead of being blanked. */
    public static final HashSet<String> NumericalKeepValues = Stream.of("0", "1", "32", "64")
            .collect(Collectors.toCollection(HashSet::new));
    /** Abbreviations used by {@link #getType(boolean)}; types not listed are not shortened. */
    private static final Map<String, String> shortTypes = Collections.unmodifiableMap(new HashMap<String, String>() {
        /**
         *
         */
        private static final long serialVersionUID = 1L;

        {
            put("ArrayAccessExpr", "ArAc");
            put("ArrayBracketPair", "ArBr");
            put("ArrayCreationExpr", "ArCr");
            put("ArrayCreationLevel", "ArCrLvl");
            put("ArrayInitializerExpr", "ArIn");
            put("ArrayType", "ArTy");
            put("AssertStmt", "Asrt");
            put("AssignExpr:and", "AsAn");
            put("AssignExpr:assign", "As");
            put("AssignExpr:lShift", "AsLS");
            put("AssignExpr:minus", "AsMi");
            put("AssignExpr:or", "AsOr");
            put("AssignExpr:plus", "AsP");
            put("AssignExpr:rem", "AsRe");
            put("AssignExpr:rSignedShift", "AsRSS");
            put("AssignExpr:rUnsignedShift", "AsRUS");
            put("AssignExpr:slash", "AsSl");
            put("AssignExpr:star", "AsSt");
            put("AssignExpr:xor", "AsX");
            put("BinaryExpr:and", "And");
            put("BinaryExpr:binAnd", "BinAnd");
            put("BinaryExpr:binOr", "BinOr");
            put("BinaryExpr:divide", "Div");
            put("BinaryExpr:equals", "Eq");
            put("BinaryExpr:greater", "Gt");
            put("BinaryExpr:greaterEquals", "Geq");
            put("BinaryExpr:less", "Ls");
            put("BinaryExpr:lessEquals", "Leq");
            put("BinaryExpr:lShift", "LS");
            put("BinaryExpr:minus", "Minus");
            put("BinaryExpr:notEquals", "Neq");
            put("BinaryExpr:or", "Or");
            put("BinaryExpr:plus", "Plus");
            put("BinaryExpr:remainder", "Mod");
            put("BinaryExpr:rSignedShift", "RSS");
            put("BinaryExpr:rUnsignedShift", "RUS");
            put("BinaryExpr:times", "Mul");
            put("BinaryExpr:xor", "Xor");
            put("BlockStmt", "Bk");
            put("BooleanLiteralExpr", "BoolEx");
            put("CastExpr", "Cast");
            put("CatchClause", "Catch");
            put("CharLiteralExpr", "CharEx");
            put("ClassExpr", "ClsEx");
            put("ClassOrInterfaceDeclaration", "ClsD");
            put("ClassOrInterfaceType", "Cls");
            put("ConditionalExpr", "Cond");
            put("ConstructorDeclaration", "Ctor");
            put("DoStmt", "Do");
            put("DoubleLiteralExpr", "Dbl");
            put("EmptyMemberDeclaration", "Emp");
            put("EnclosedExpr", "Enc");
            put("ExplicitConstructorInvocationStmt", "ExpCtor");
            put("ExpressionStmt", "Ex");
            put("FieldAccessExpr", "Fld");
            put("FieldDeclaration", "FldDec");
            put("ForeachStmt", "Foreach");
            put("ForStmt", "For");
            put("IfStmt", "If");
            put("InitializerDeclaration", "Init");
            put("InstanceOfExpr", "InstanceOf");
            put("IntegerLiteralExpr", "IntEx");
            put("IntegerLiteralMinValueExpr", "IntMinEx");
            put("LabeledStmt", "Labeled");
            put("LambdaExpr", "Lambda");
            put("LongLiteralExpr", "LongEx");
            put("MarkerAnnotationExpr", "MarkerExpr");
            put("MemberValuePair", "Mvp");
            put("MethodCallExpr", "Cal");
            put("MethodDeclaration", "Mth");
            put("MethodReferenceExpr", "MethRef");
            put("NameExpr", "Nm");
            put("NormalAnnotationExpr", "NormEx");
            put("NullLiteralExpr", "Null");
            put("ObjectCreationExpr", "ObjEx");
            put("Parameter", "Prm");
            put("PrimitiveType", "Prim");
            put("QualifiedNameExpr", "Qua");
            put("ReturnStmt", "Ret");
            put("SingleMemberAnnotationExpr", "SMEx");
            put("StringLiteralExpr", "StrEx");
            put("SuperExpr", "SupEx");
            put("SwitchEntryStmt", "SwiEnt");
            put("SwitchStmt", "Switch");
            put("SynchronizedStmt", "Sync");
            put("ThisExpr", "This");
            put("ThrowStmt", "Thro");
            put("TryStmt", "Try");
            put("TypeDeclarationStmt", "TypeDec");
            put("TypeExpr", "Type");
            put("TypeParameter", "TypePar");
            put("UnaryExpr:inverse", "Inverse");
            put("UnaryExpr:negative", "Neg");
            put("UnaryExpr:not", "Not");
            put("UnaryExpr:posDecrement", "PosDec");
            put("UnaryExpr:posIncrement", "PosInc");
            put("UnaryExpr:positive", "Pos");
            put("UnaryExpr:preDecrement", "PreDec");
            put("UnaryExpr:preIncrement", "PreInc");
            put("UnionType", "Unio");
            put("VariableDeclarationExpr", "VDE");
            put("VariableDeclarator", "VD");
            put("VariableDeclaratorId", "VDID");
            put("VoidType", "Void");
            put("WhileStmt", "While");
            put("WildcardType", "Wild");
        }
    });
    private final String RawType;
    private String Type;
    private String SplitName;

    /**
     * Computes the type and name labels of {@code node}.
     *
     * @param node            AST node to describe
     * @param isLeaf          whether the node counts as a leaf
     * @param isGenericParent whether the node is a generic class type with type arguments;
     *                        the node is then cast to {@code ClassOrInterfaceType}
     */
    public Property(Node node, boolean isLeaf, boolean isGenericParent) {
        Class<?> nodeClass = node.getClass();
        RawType = Type = nodeClass.getSimpleName();
        // Boxed types (as reported by isBoxedType) are labelled like primitives.
        if (node instanceof ClassOrInterfaceType && ((ClassOrInterfaceType) node).isBoxedType()) {
            Type = "PrimitiveType";
        }
        // Operators become part of the type, e.g. "BinaryExpr:plus".
        String operator = "";
        if (node instanceof BinaryExpr) {
            operator = ((BinaryExpr) node).getOperator().toString();
        } else if (node instanceof UnaryExpr) {
            operator = ((UnaryExpr) node).getOperator().toString();
        } else if (node instanceof AssignExpr) {
            operator = ((AssignExpr) node).getOperator().toString();
        }
        if (operator.length() > 0) {
            Type += ":" + operator;
        }

        String nameToSplit = node.toString();
        if (isGenericParent) {
            nameToSplit = ((ClassOrInterfaceType) node).getName();
            if (isLeaf) {
                // if it is a generic parent which counts as a leaf, then when
                // it is participating in a path
                // as a parent, it should be GenericClass and not a simple
                // ClassOrInterfaceType.
                Type = "GenericClass";
            }
        }
        ArrayList<String> splitNameParts = Common.splitToSubtokens(nameToSplit);
        SplitName = String.join(Common.internalSeparator, splitNameParts);

        String name = Common.normalizeName(node.toString(), Common.BlankWord);
        if (name.length() > Common.c_MaxLabelLength) {
            name = name.substring(0, Common.c_MaxLabelLength);
        } else if (node instanceof ClassOrInterfaceType && ((ClassOrInterfaceType) node).isBoxedType()) {
            name = ((ClassOrInterfaceType) node).toUnboxedType().toString();
        }

        // The name of a method declaration is replaced by a fixed placeholder.
        if (Common.isMethod(node, Type)) {
            name = SplitName = Common.methodName;
        }

        // Nothing survived the sub-token split: fall back to the normalized name.
        if (SplitName.length() == 0) {
            SplitName = name;
            if (node instanceof IntegerLiteralExpr && !NumericalKeepValues.contains(SplitName)) {
                // This is a numeric literal, but not in our white list
                SplitName = "";
            }
        }
    }

    /** @return the simple class name of the node, never modified */
    public String getRawType() {
        return RawType;
    }

    /** @return the full (unshortened) type label */
    public String getType() {
        return Type;
    }

    /**
     * @param shorten whether to return the abbreviation of the type
     * @return the abbreviation when requested and known, otherwise the full type label
     */
    public String getType(boolean shorten) {
        if (shorten) {
            return shortTypes.getOrDefault(Type, Type);
        } else {
            return Type;
        }
    }

    /** @return the sub-tokens of the name joined by {@code Common.internalSeparator} */
    public String getName() {
        return SplitName;
    }
}
================================================
FILE: JavaExtractor/JPredict/src/main/java/JavaExtractor/Visitors/FunctionVisitor.java
================================================
package JavaExtractor.Visitors;
import JavaExtractor.Common.CommandLineValues;
import JavaExtractor.Common.Common;
import JavaExtractor.Common.MethodContent;
import com.github.javaparser.ast.Node;
import com.github.javaparser.ast.body.MethodDeclaration;
import com.github.javaparser.ast.visitor.VoidVisitorAdapter;
import java.util.ArrayList;
import java.util.Arrays;
@SuppressWarnings("StringEquality")
public class FunctionVisitor extends VoidVisitorAdapter